├── .gitignore ├── simpledbf ├── __init__.py └── simpledbf.py ├── setup.py ├── LICENSE.txt ├── RELEASENOTES.rst └── README.rst /.gitignore: -------------------------------------------------------------------------------- 1 | *swp 2 | TODO* 3 | *pyc 4 | .pypirc 5 | dist/ 6 | simpledbf.egg* 7 | -------------------------------------------------------------------------------- /simpledbf/__init__.py: -------------------------------------------------------------------------------- 1 | from .simpledbf import Dbf5 2 | 3 | __all__ = ['Dbf5',] 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.rst') as file: 4 | long_description = file.read() 5 | 6 | setup( 7 | name = "simpledbf", 8 | version = "0.2.6", 9 | 10 | description = "Convert DBF files to CSV, DataFrames, HDF5 tables, and "\ 11 | "SQL tables. Python3 compatible.", 12 | url = "https://github.com/rnelsonchem/simpledbf", 13 | long_description = long_description, 14 | 15 | author = "Ryan Nelson", 16 | author_email = "rnelsonchem@gmail.com", 17 | 18 | license = "BSD", 19 | classifiers = [ 20 | 'Development Status :: 4 - Beta', 21 | 'Intended Audience :: Developers', 22 | 'Intended Audience :: Science/Research', 23 | 'License :: OSI Approved :: BSD License', 24 | 'Programming Language :: Python :: 2', 25 | 'Programming Language :: Python :: 2.7', 26 | 'Programming Language :: Python :: 3', 27 | 'Programming Language :: Python :: 3.4', 28 | ], 29 | 30 | keywords = "DBF CSV Pandas SQLalchemy PyTables DataFrame SQL HDF", 31 | 32 | packages = find_packages(), 33 | 34 | ) 35 | 36 | 37 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Ryan Nelson. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the NumPy Developers nor the names of any 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /RELEASENOTES.rst: -------------------------------------------------------------------------------- 1 | simpledbf 0.2.6 Release Notes 2 | ############################# 3 | 4 | Bug Fixes 5 | --------- 6 | 7 | * Fixed the pure-CSV output function. Fix contributed by @sarasafavi. 8 | 9 | simpledbf 0.2.4 Release Notes 10 | ############################# 11 | 12 | Highlights 13 | ---------- 14 | 15 | * Added `data_columns`, `complib`, and `complevel` keyword arguments to the 16 | `to_pandashdf` method. The function of these mirrors the equivalent 17 | arguments used in the Pandas HDFStore object and HDFStore.append method. See 18 | the Pandas docs for more information. 19 | 20 | Bug Fixes 21 | --------- 22 | 23 | * Added an index column to the CSV-based SQL output. This is necessary for 24 | selecting data from the SQL file, and also for reading the SQL table back into a 25 | DataFrame at a later time. 26 | 27 | simpledbf 0.2.3 Release Notes 28 | ############################# 29 | 30 | Highlights 31 | ---------- 32 | 33 | * Added a pure-Python SQL option which writes a SQL table creation file and a 34 | header-less CSV file. This is much more efficient for uploading large data 35 | files to SQL. 36 | 37 | API Changes 38 | ----------- 39 | 40 | * The default NaN value for float/int columns was changed to always be 41 | ``float('nan')``. This is necessary for DBF->SQL->DF conversion, even though 42 | the CSV files now have 'nan' for all empty values. 43 | 44 | simpledbf 0.2.2 Release Notes 45 | ############################# 46 | 47 | Highlights 48 | ---------- 49 | 50 | * Added an optional 'codec' keyword argument to Dbf5 __init__, which controls 51 | the decoding of the values in the DBF file and CSV output file. Default is 52 | 'utf-8'. 53 | 54 | * Made a couple of small algorithmic changes that improved performance. 55 | 56 | Bug Fixes 57 | --------- 58 | 59 | * The 'na' flag now works properly. (In previous versions, it was always 60 | setting empty values to the string 'nan'.) 61 | 62 | * Properly set the string column width for HDF chunksize-only output. (The 63 | column width is set to max(string len) by default, which may not be the 64 | largest for every chunk. Used the dbf header info to fix this.) 65 | 66 | simpledbf 0.2.1 Release Notes 67 | ############################# 68 | 69 | Highlights 70 | ---------- 71 | 72 | * Added a 'na' keyword argument that controls the value of missing/bad data. 73 | 74 | * Set the default 'na' to the empty string ('') for CSV and NaN ('nan') for 75 | all other exports. 76 | 77 | simpledbf 0.2.0 Release Notes 78 | ############################# 79 | 80 | Functionality stays the same, but a few implementation details have changed. 81 | Tested with Python2, and everything except HDF export works fine. 82 | 83 | Highlights 84 | ---------- 85 | 86 | * Empty strings are converted to NaN (i.e. `float('nan')`). 87 | 88 | * Added try/except clauses to all other types, so poorly formatted values 89 | will be returned as NaN as well. This may not be the behavior that is 90 | expected, so be careful. 91 | 92 | simpledbf 0.1.0 Release Notes 93 | ############################# 94 | 95 | First release.
96 | 97 | Highlights 98 | ---------- 99 | 100 | * Pure-Python3 read of DBF files 101 | 102 | * Pure-Python3 write as CSV 103 | 104 | * Convert to DataFrame (Pandas required) 105 | 106 | * Convert to HDF5 table (Pandas and PyTables required) 107 | 108 | * Convert to SQL table (Pandas and SQLalchemy required) 109 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | simpledbf 2 | ######### 3 | 4 | *simpledbf* is a Python library for converting basic DBF files (see 5 | `Limitations`_) to CSV files, Pandas DataFrames, SQL tables, or HDF5 tables. 6 | This package is fully compatible with Python >=3.4, with almost complete 7 | `Python 2.7 support`_ as well. The conversion to CSV and SQL (see 8 | ``to_textsql`` below) is entirely written in Python, so no additional 9 | dependencies are necessary. For other export formats, see `Optional 10 | Requirements`_. This code was designed to be very simple, fast and memory 11 | efficient for convenient interactive or batch file processing; therefore, it 12 | lacks many features, such as the ability to write DBF files, that other 13 | packages might provide. 14 | 15 | Bug fixes, questions, and update requests are encouraged and can be filed at 16 | the `GitHub repo`_. 17 | 18 | This code is derived from an `ActiveState DBF example`_ that works with 19 | Python2 and is distributed under a PSF license. 20 | 21 | 22 | .. _Optional Requirements: 23 | 24 | Optional Requirements 25 | --------------------- 26 | 27 | * Pandas >= 0.15.2 (Required for DataFrame) 28 | 29 | * PyTables >= 3.1 (with Pandas required for HDF tables) 30 | 31 | * SQLalchemy >= 0.9 (with Pandas required for DataFrame-SQL tables) 32 | 33 | Installation 34 | ------------ 35 | 36 | The most recent release of *simpledbf* can be installed using ``pip`` or 37 | ``conda``, if you happen to be using the `Anaconda Python distribution`_. 38 | 39 | Using ``conda``:: 40 | 41 | $ conda install -c https://conda.binstar.org/rnelsonchem simpledbf 42 | 43 | Using ``pip``:: 44 | 45 | $ pip install simpledbf 46 | 47 | The development version can be installed from GitHub:: 48 | 49 | $ pip install git+https://github.com/rnelsonchem/simpledbf.git 50 | 51 | As an alternative, this package only contains a single file, so in principle, 52 | you could download the ``simpledbf.py`` file from Github and put it in any 53 | folder of your choosing. 54 | 55 | 56 | .. _Limitations: 57 | 58 | DBF File Limitations 59 | -------------------- 60 | 61 | This package currently supports a subset of `dBase III through 5`_ DBF files. 62 | In particular, support is missing for linked memo (i.e. DBT) files. This is 63 | mostly due to limitations in the types of files available to the author. Feel 64 | free to request an update if you can supply a DBF file with an associated memo 65 | file. `DBF version 7`_, the most recent DBF file spec, is not currently 66 | supported by this package. 67 | 68 | 69 | .. _Python 2.7 support: 70 | 71 | Python 2 Support 72 | ---------------- 73 | 74 | Except for HDF file export, this code should work fine with Python >=2.7. 75 | However, HDF files created in Python3 are compatible with all Python2 HDF 76 | packages, so in principle, you could make any HDF files in a temporary Python3 77 | environment. If you are using the `Anaconda Python distribution`_ 78 | (recommended), then you can make a small Python3 working environment as 79 | follows: 80 | 81 | .. 
code:: 82 | 83 | $ conda create -n dbf python=3 pandas pytables sqlalchemy 84 | # Lots of output... 85 | 86 | $ source activate dbf 87 | 88 | dbf>$ conda install -c https://conda.binstar.org/rnelsonchem simpledbf 89 | 90 | dbf>$ python my_py3_hdf_creation_script.py 91 | # This is using Python3 92 | 93 | dbf>$ source deactivate 94 | 95 | $ python my_py2_stuff_with_hdf.py 96 | # This is using Python2 again 97 | 98 | HDF file export is currently broken in Python2 due to a `limitation in Pandas 99 | HDF export with unicode`_. This issue may be fixed in future versions of 100 | Pandas/PyTables. 101 | 102 | 103 | Example Usage 104 | ############# 105 | 106 | .. _Loading: 107 | 108 | Load a DBF file 109 | --------------- 110 | 111 | This module currently only defines a single class, ``Dbf5``, which is 112 | instantiated with a DBF file name, which can contain path info as well. An 113 | optional 'codec' keyword argument controls the codec used for 114 | reading/writing files. The default is 'utf-8'. See the documentation for 115 | Python's `codec standard library module`_ for more codec options. 116 | 117 | .. code:: 118 | 119 | In : from simpledbf import Dbf5 120 | 121 | In : dbf = Dbf5('fake_file_name.dbf', codec='utf-8') 122 | 123 | The ``Dbf5`` object initially only reads the header information from the file, 124 | so you can inspect some of the properties. For example, ``numrec`` is the 125 | number of records in the DBF file, and ``fields`` is a list of tuples with 126 | information about the data columns. See the DBF file spec for info on the 127 | column type characters. The "DeletionFlag" column is always present as a check 128 | for deleted records; however, it is never exported during conversion. 129 | 130 | .. code:: 131 | 132 | In : dbf.numrec 133 | Out: 10000 134 | 135 | In : dbf.fields 136 | Out: [('DeletionFlag', 'C', 1), ('col_1', 'C', 15), ('col_2', 'N', 2)] 137 | 138 | The docstring for this object contains a complete listing of attributes and 139 | their descriptions. 140 | 141 | The ``mem`` method gives an approximate memory requirement for processing this 142 | DBF file. (~2x the total file size, which could be wildly inaccurate.) In 143 | addition, all of the output methods in this object take a ``chunksize`` 144 | keyword argument, which lets you split up the processing of large files into 145 | smaller chunks to limit the total memory usage of the conversion process. When 146 | this keyword argument is passed into ``mem``, the approximate memory footprint 147 | of the chunk will also be given, which can be useful when trying to determine 148 | the maximum chunksize your memory will allow. 149 | 150 | .. code:: 151 | 152 | In : dbf.mem() 153 | This total process would require more than 350.2 MB of RAM. 154 | 155 | In : dbf.mem(chunksize=1000) 156 | Each chunk will require 4.793 MB of RAM. 157 | This total process would require more than 350.2 MB of RAM. 158 | 159 | 160 | Export the Data 161 | --------------- 162 | 163 | The ``Dbf5`` object behaves like Python's file object in that it will be 164 | "exhausted" after export. To re-export the DBF data to a different format, 165 | first create a new ``Dbf5`` instance using the same file name. This procedure 166 | is followed in the documentation below. 167 | 168 | 169 | Note on Empty/Bad Data 170 | ++++++++++++++++++++++ 171 | 172 | This package attempts to convert most blank strings and poorly formatted 173 | values to an empty value of your choosing. This is controlled by the ``na`` 174 | keyword argument to all export functions. The default for CSV is an empty 175 | string (''), and for all other exports, it is 'nan', which converts empty/bad 176 | values to ``float('nan')``. *NOTE* The exception here is that float/int 177 | columns always use ``float('nan')`` for all missing values for 178 | DBF->SQL->DataFrame conversion purposes. Pandas has very powerful functions 179 | for `working with missing data`_, including converting NaN to other values 180 | (e.g. empty strings).
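If a different placeholder is more convenient, the swap can also be done after conversion with Pandas itself. The snippet below is only a sketch (it assumes Pandas is installed and reuses the placeholder file name from the examples that follow); ``fillna`` simply replaces every NaN with an empty string.

.. code::

    In : from simpledbf import Dbf5

    In : dbf = Dbf5('fake_file_name.dbf')

    In : df = dbf.to_dataframe()

    In : df = df.fillna('')
    # All float('nan') placeholders are now empty strings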
181 | 182 | 183 | To CSV 184 | ++++++ 185 | 186 | Use the ``to_csv`` method to export the data to a CSV file. This method 187 | requires the name of a CSV file as an input. The default behavior is to append 188 | new data to an existing file, so be careful if the file already exists. The 189 | ``chunksize`` keyword argument controls how often the file buffer 190 | will be flushed, which may not be necessary. The ``na`` keyword changes the 191 | value used for missing/bad entries (default is ''). The keyword ``header`` is 192 | a boolean that controls writing of the column names as the first row of the 193 | CSV file. The encoding of the resulting CSV file is determined by the codec 194 | that is set when opening the DBF file; see `Loading`_. 195 | 196 | .. code:: 197 | 198 | In : dbf = Dbf5('fake_file_name.dbf') 199 | 200 | In : dbf.to_csv('junk.csv') 201 | 202 | If you are unhappy with the default CSV output of this module, Pandas also has 203 | very `powerful CSV export capabilities`_ for DataFrames.
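For example, a DataFrame round-trip gives you full control over quoting, separators, and encoding. This is only an illustrative sketch (Pandas required; 'junk_df.csv' is an arbitrary output name):

.. code::

    In : dbf = Dbf5('fake_file_name.dbf')

    In : df = dbf.to_dataframe()

    In : df.to_csv('junk_df.csv', index=False, encoding='utf-8')
    # Pandas writes the CSV instead of simpledbf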
204 | 205 | 206 | To SQL (CSV-based) 207 | ++++++++++++++++++ 208 | 209 | Most SQL databases can create tables directly from local CSV files. The 210 | pure-Python ``to_textsql`` method creates two files: 1) a header-less CSV file 211 | containing the DBF contents, and 2) a SQL file containing the appropriate 212 | table creation and CSV import code. It is up to you to run the SQL file as a 213 | separate step. This function takes two mandatory arguments, which are simply 214 | the names of the SQL and CSV files, respectively. In addition, there are a 215 | number of optional keyword arguments as well. ``sqltype`` controls the output 216 | dialect. The default is 'sqlite', but 'postgres' is also accepted. ``table`` 217 | sets the name of the SQL table that will be created. By default, this will be 218 | the name of the DBF file without the file extension. Quote 219 | characters (") in text fields must be escaped in the CSV file. This is controlled with the ``escapequote`` 220 | keyword, which defaults to ``'"'``. (This changes '"' in text strings to '""', 221 | which the SQL server should ignore.) The ``chunksize``, ``na``, and ``header`` 222 | keywords are used to control the CSV file. See above. 223 | 224 | Here's an example for SQLite: 225 | 226 | .. code:: 227 | 228 | In : dbf = Dbf5('fake_file_name.dbf') 229 | 230 | In : dbf.to_textsql('junk.sql', 'junk.csv') 231 | 232 | # Exit Python 233 | $ sqlite3 junk.db < junk.sql 234 | 235 | Here's an example for Postgresql: 236 | 237 | .. code:: 238 | 239 | In : dbf = Dbf5('fake_file_name.dbf') 240 | 241 | In : dbf.to_textsql('junk.sql', 'junk.csv', sqltype='postgres') 242 | 243 | # Exit Python 244 | $ psql -U username -f junk.sql db_name 245 | 246 | To DataFrame 247 | ++++++++++++ 248 | 249 | The ``to_dataframe`` method returns the DBF records as a Pandas DataFrame. If 250 | the size of the DBF file exceeds available memory, then passing the 251 | ``chunksize`` keyword argument will return a generator function. This 252 | generator yields DataFrames of length <= ``chunksize`` until all of the records have 253 | been processed. The ``na`` keyword changes the value used for missing/bad 254 | entries (default is 'nan', which inserts ``float('nan')``). 255 | 256 | .. code:: 257 | 258 | In : dbf = Dbf5('fake_file_name.dbf') 259 | 260 | In : df = dbf.to_dataframe() 261 | # df is a DataFrame with all records 262 | 263 | In : dbf = Dbf5('fake_file_name.dbf') 264 | 265 | In : for df in dbf.to_dataframe(chunksize=10000): 266 | .... do_cool_stuff(df) 267 | # Here a generator is returned 268 | 269 | .. _chunksize issue: 270 | 271 | Issue with DataFrame Chunksize 272 | ++++++++++++++++++++++++++++++ 273 | 274 | When a DataFrame is constructed, it attempts to determine the dtype of each 275 | column. If you chunk the DataFrame output, it turns out that the dtype for a 276 | column can change. For example, if one chunk has a column with all strings, 277 | the dtype will be ``np.object``; however, if in the next chunk that same 278 | column is full of ``float('nan')``, the resulting dtype will be set as 279 | ``float``. This has some consequences for writing to SQL and HDF tables as 280 | well. In principle, this behavior could be changed, but it is currently 281 | non-trivial to set the dtypes for DataFrame columns on construction. Please 282 | file a PR through GitHub if this is a big problem.
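Until then, one possible workaround (shown only as a sketch, not a feature of this package) is to coerce the affected column to a fixed dtype as each chunk is yielded; 'col_1' below is a hypothetical text column that should stay object-typed:

.. code::

    In : dbf = Dbf5('fake_file_name.dbf')

    In : for df in dbf.to_dataframe(chunksize=10000):
    .... df['col_1'] = df['col_1'].astype(object)
    .... do_cool_stuff(df)
    # Every chunk now reports the same dtype for 'col_1'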
283 | 284 | 285 | To an SQL Table using Pandas 286 | ++++++++++++++++++++++++++++ 287 | 288 | The ``to_pandassql`` method will transfer the DBF entries to an SQL database 289 | table of your choice using a combination of Pandas DataFrames and SQLalchemy. 290 | A valid `SQLalchemy engine string`_ argument is required to connect with the 291 | database. Database support will be limited to those supported by SQLalchemy. 292 | (This has been tested with SQLite and Postgresql.) Note: if you are 293 | transferring a large amount of data, this method will be very slow. If you 294 | have direct access to the SQL server, you might want to use the text-based SQL 295 | export instead. 296 | 297 | .. code:: 298 | 299 | In : dbf = Dbf5('fake_file_name.dbf') 300 | 301 | In : dbf.to_pandassql('sqlite:///foo.db') 302 | 303 | This method accepts three optional arguments. ``table`` is the name of the 304 | table you'd like to use. If this is not passed, your new table will have the 305 | same name as the DBF file without the file extension. Again, the default here is 306 | to append to an existing table. If you want to start fresh, delete the 307 | existing table before using this function. The ``chunksize`` keyword processes 308 | the DBF file in chunks of records no larger than this size. The ``na`` keyword 309 | changes the value used for missing/bad entries (default is 'nan', which inserts 310 | ``float('nan')``). 311 | 312 | .. code:: 313 | 314 | In : dbf = Dbf5('fake_file_name.dbf') 315 | 316 | In : dbf.to_pandassql('sqlite:///foo.db', table="fake_tbl", 317 | .... chunksize=100000) 318 | 319 | 320 | To an HDF5 Table 321 | ++++++++++++++++ 322 | 323 | The ``to_pandashdf`` method transfers the DBF entries to an HDF5 table of your 324 | choice. This method uses a combination of Pandas DataFrames and PyTables, so 325 | both of these packages must be installed. This method requires a file name 326 | string for the HDF file, which will be created if it does not exist. Again, 327 | the default behavior is to append to an existing file of that name, so be 328 | careful here. The HDF file will be created using the highest level of 329 | compression (9) with the 'blosc' compression lib. This saves an enormous 330 | amount of disk space, with little degradation of performance; however, this 331 | compression library is non-standard, which can cause problems with other HDF 332 | libraries. Compression options are controlled using the ``complib`` and 333 | ``complevel`` keyword arguments, which are identical to the ones described in 334 | the `Pandas HDF compression docs`_. 335 | 336 | .. code:: 337 | 338 | In : dbf = Dbf5('fake_file_name.dbf') 339 | 340 | In : dbf.to_pandashdf('fake.h5') 341 | 342 | This method uses the same optional arguments, and corresponding defaults, as 343 | ``to_pandassql`` (see above). An example with ``chunksize`` is shown below. In 344 | addition, a ``data_columns`` keyword argument is also available, which sets 345 | the columns that will be used as data columns in the HDF table. Data columns 346 | can be used for advanced searching and selection; however, there is some 347 | degradation of performance for large numbers of data columns. See the `Pandas 348 | data columns docs`_ for a more detailed explanation. 349 | 350 | .. code:: 351 | 352 | In : dbf = Dbf5('fake_file_name.dbf') 353 | 354 | In : dbf.to_pandashdf('fake.h5', table="fake_tbl", chunksize=100000) 355 | 356 | See the `chunksize issue`_ for DataFrame export for information on a potential 357 | problem you may encounter with chunksize. 358 | 359 | 360 | Batch Export 361 | ++++++++++++ 362 | 363 | Batch file export is trivial using *simpledbf*. For example, the following 364 | code processes all DBF files in the current directory into separate tables in 365 | a single HDF file. 366 | 367 | .. code:: 368 | 369 | In : import os 370 | 371 | In : from simpledbf import Dbf5 372 | 373 | In : files = os.listdir('.') 374 | 375 | In : for f in files: 376 | .... if f[-3:].lower() == 'dbf': 377 | .... dbf = Dbf5(f) 378 | .... dbf.to_pandashdf('all_data.h5') 379 | 380 | 381 | .. External Hyperlinks 382 | 383 | .. _ActiveState DBF example: http://code.activestate.com/recipes/ 384 | 362715-dbf-reader-and-writer/ 385 | .. _GitHub repo: https://github.com/rnelsonchem/simpledbf 386 | .. _dBase III through 5: http://ulisse.elettra.trieste.it/services/doc/ 387 | dbase/DBFstruct.htm 388 | .. _DBF version 7: http://www.dbase.com/KnowledgeBase/int/db7_file_fmt.htm 389 | .. _Anaconda Python distribution: http://continuum.io/downloads 390 | .. _limitation in Pandas HDF export with unicode: http://pandas.pydata.org/ 391 | pandas-docs/stable/io.html#datatypes 392 | .. _codec standard library module: https://docs.python.org/3.4/library/ 393 | codecs.html 394 | .. _working with missing data: http://pandas.pydata.org/pandas-docs/stable/ 395 | missing_data.html 396 | .. _powerful CSV export capabilities: http://pandas.pydata.org/pandas-docs/ 397 | stable/io.html#writing-to-csv-format 398 | .. _SQLalchemy engine string: http://docs.sqlalchemy.org/en/rel_0_9/core/ 399 | engines.html 400 | .. _Pandas HDF compression docs: http://pandas.pydata.org/pandas-docs/stable/ 401 | io.html#compression 402 | ..
_Pandas data columns docs: http://pandas.pydata.org/pandas-docs/stable/ 403 | io.html#query-via-data-columns 404 | 405 | 406 | 407 | -------------------------------------------------------------------------------- /simpledbf/simpledbf.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import datetime 3 | import os 4 | import codecs 5 | 6 | # Check for optional dependencies. 7 | try: 8 | import pandas as pd 9 | except: 10 | print("Pandas is not installed. No support for DataFrames, HDF, or SQL.") 11 | else: 12 | try: 13 | import tables as tb 14 | except: 15 | print("PyTables is not installed. No support for HDF output.") 16 | try: 17 | import sqlalchemy as sql 18 | except: 19 | print("SQLalchemy is not installed. No support for SQL output.") 20 | 21 | sqltypes = { 22 | 'sqlite': {'str':'TEXT', 'float':'REAL', 'int': 'INTEGER', 23 | 'date':'TEXT', 'bool':'INTEGER', 24 | 'end': '.mode csv {table}\n.import {csvname} {table}', 25 | 'start': 'CREATE TABLE {} (\n', 26 | 'index': '"index" INTEGER PRIMARY KEY ASC', 27 | }, 28 | 'postgres': {'str': 'text', 'float': 'double precision', 29 | 'int':'bigint', 'date':'date', 'bool':'boolean', 30 | 'end': '''\copy "{table}" from '{csvname}' delimiter ',' csv''', 31 | 'start': 'CREATE TABLE "{}" (\n', 32 | 'index': '"index" INTEGER PRIMARY KEY', 33 | }, 34 | } 35 | 36 | class DbfBase(object): 37 | ''' 38 | Base class for DBF file processing objects. 39 | 40 | Do not instantiate this class. This provides some of the common functions 41 | for other subclasses. 42 | ''' 43 | def _chunker(self, chunksize): 44 | '''Return a list of chunk ints from given chunksize. 45 | 46 | Parameters 47 | ---------- 48 | chunksize : int 49 | The maximum chunk size 50 | 51 | Returns 52 | ------- 53 | list of ints 54 | A list of chunks necessary to break up a given file. These will 55 | all be equal to `chunksize`, except for the last value, which is 56 | the remainder (<= `chunksize). 57 | ''' 58 | num = self.numrec//chunksize 59 | # Chunksize bigger than numrec 60 | if num == 0: 61 | return [self.numrec,] 62 | else: 63 | chunks = [chunksize,]*num 64 | remain = self.numrec%chunksize 65 | if remain != 0: 66 | chunks.append(remain) 67 | return chunks 68 | 69 | def _na_set(self, na): 70 | '''Set the value used for missing/bad data. 71 | 72 | Parameters 73 | ---------- 74 | na : various types accepted 75 | The value that will be used to replace missing or malformed 76 | entries. Right now this accepts pretty much anything, and that 77 | value will be used as a replacement. (May not do what you expect.) 78 | However, the strings 'na' or 'nan' (case insensitive) will insert 79 | float('nan'), the string 'none' (case insensitive) or will insert 80 | the Python object `None`. Float/int columns are always 81 | float('nan') regardless of this setting. 82 | ''' 83 | if na.lower() == 'none': 84 | self._na = None 85 | elif na.lower() in ('na', 'nan'): 86 | self._na = float('nan') 87 | else: 88 | self._na = na 89 | 90 | def mem(self, chunksize=None): 91 | '''Print the memory usage for processing the DBF File. 92 | 93 | Parameters 94 | ---------- 95 | chunksize : int, optional 96 | The maximum chunk size that will be used to process this file. 97 | 98 | Notes 99 | ----- 100 | This method will print the maximum amount of RAM that will be 101 | necessary to process and load the DBF file. (This is ~2x the file 102 | size.) 
However, if the optional chunksize is passed, this function 103 | will also print memory usage per chunk as well, which can be useful 104 | for efficiently chunking and processing a file. 105 | ''' 106 | if chunksize: 107 | if chunksize > self.numrec: 108 | print("Chunksize larger than number of recs.") 109 | print("Chunksize set to {:d}.".format(self.numrec)) 110 | else: 111 | smallmem = 2.*(self.fmtsiz*chunksize/1024**2) 112 | chkout = "Each chunk will require {:.4g} MB of RAM." 113 | print(chkout.format(smallmem)) 114 | memory = 2.*(self.fmtsiz*self.numrec/1024**2) 115 | out = "This total process would require more than {:.4g} MB of RAM." 116 | print(out.format(memory)) 117 | 118 | def to_csv(self, csvname, chunksize=None, na='', header=True): 119 | '''Write DBF file contents to a CSV file. 120 | 121 | Parameters 122 | ---------- 123 | csvname : string 124 | The name of the CSV file that will be created. By default, the 125 | file will be opened in 'append' mode. This won't delete an already 126 | existing file, but it will add new data to the end. May not be 127 | what you want. 128 | 129 | chunksize : int, optional 130 | If this is set, the contents of the file buffer will be flushed 131 | after processing this many records. May be useful for very large 132 | files that exceed the available RAM. 133 | 134 | na : various types accepted, optional 135 | The value that will be used to replace missing or malformed 136 | entries. Right now this accepts pretty much anything, and that 137 | value will be used as a replacement. (May not do what you expect.) 138 | However, the strings 'na' or 'nan' (case insensitive) will insert 139 | float('nan'), the string 'none' (case insensitive) or will insert 140 | the Python object `None`. Default for CSV is an empty string (''); 141 | however, float/int columns are always float('nan'). 142 | 143 | header : boolean, optional 144 | Write out a header line with the column names. Default is True. 145 | ''' 146 | self._na_set(na) 147 | # set index column; this is only True when used with to_textsql() 148 | self._idx = False 149 | csv = codecs.open(csvname, 'a', encoding=self._enc) 150 | if header: 151 | column_line = ','.join(self.columns) 152 | csv.write(column_line + '\n') 153 | 154 | # Build up a formatting string for output. 155 | outs = [] 156 | for field in self.fields: 157 | if field[0] == "DeletionFlag": 158 | # Add an index column placeholder 159 | if self._idx: 160 | outs.append('{}') 161 | else: 162 | continue 163 | # Wrap strings in quotes 164 | elif field[1] in 'CDL': 165 | outs.append('"{}"') 166 | elif field[1] in 'NF': 167 | outs.append('{}') 168 | # Make the outline unicode or it won't write out properly for UTF-8 169 | out_line = u','.join(outs) + '\n' 170 | 171 | count = 0 172 | for n, result in enumerate(self._get_recs()): 173 | if self._idx: 174 | out_string = out_line.format(n, *result) 175 | else: 176 | out_string = out_line.format(*result) 177 | 178 | csv.write(out_string) 179 | count += 1 180 | if count == chunksize: 181 | csv.flush() 182 | count = 0 183 | csv.close() 184 | 185 | def to_textsql(self, sqlname, csvname, sqltype='sqlite', table=None, 186 | chunksize=None, na='', header=False, escapequote='"'): 187 | '''Write a SQL input file along with a CSV File. 188 | 189 | This function generates a header-less CSV file along with an SQL input 190 | file. The SQL file creates the database table and imports the CSV 191 | data. This works sqlite and postgresql. 
192 | 193 | Parameters 194 | ---------- 195 | sqlname : str 196 | Name of the SQL text file that will be created. 197 | 198 | csvname : str 199 | Name of the CSV file to be generated. See `to_csv`. 200 | 201 | sqltype : str, optional 202 | SQL dialect to use for SQL file. Default is 'sqlite'. Also accepts 203 | 'postgres' for Postgresql. 204 | 205 | table : str or None, optional 206 | Table name to generate. If None (default), the table name will be 207 | the name of the DBF input file without the file extension. 208 | Otherwise, the given string will be used. 209 | 210 | chunksize : int, optional 211 | Maximum number of records per chunk when writing the CSV. Default is None. See 212 | `to_csv`. 213 | 214 | na : various types accepted, optional 215 | Type to use for missing values. Default is ''. See `to_csv`. 216 | 217 | header : bool, optional 218 | Write header to the CSV output file. Default is False. Some SQL 219 | engines try to process a header line as data, which can be a 220 | problem. 221 | 222 | escapequote : str, optional 223 | Use this character to escape quotes (") in string columns. The 224 | default is `'"'`. For sqlite and postgresql, a doubled quote 225 | character in a text string is read back as a single quote character, i.e. '""' 226 | is converted to '"'. 227 | ''' 228 | # Create an index column 229 | self._idx = True 230 | # Set the quote escape 231 | self._esc = escapequote 232 | # Get a dictionary of type conversions for a particular sql dialect 233 | sqldict = sqltypes[sqltype] 234 | # Create table name if not given 235 | if not table: 236 | table = self.dbf[:-4] # strip trailing ".dbf" 237 | # Write the csv file 238 | self.to_csv(csvname, chunksize=chunksize, na=na, header=header) 239 | 240 | # Write the header for the table creation. 241 | sql = codecs.open(sqlname, 'w', encoding=self._enc) 242 | head = sqldict['start'] 243 | sql.write(head.format(table)) 244 | 245 | # Make an output string and container for all strings. 246 | out_str = '"{}" {}' 247 | outs = [] 248 | for field in self.fields: 249 | name, typ, size = field 250 | # Skip the first field 251 | if name == "DeletionFlag": 252 | continue 253 | 254 | # Convert Python type to SQL type 255 | if name in self._dtypes: 256 | dtype = self._dtypes[name] 257 | outtype = sqldict[dtype] 258 | else: 259 | # If the column does not have a type, probably all missing 260 | # Try our best to make it the correct type for self._na 261 | if typ == 'C': 262 | outtype = sqldict['str'] 263 | elif typ in 'NF': 264 | outtype = sqldict['float'] 265 | elif typ == 'L': 266 | outtype = sqldict['bool'] 267 | elif typ == 'D': 268 | outtype = sqldict['date'] 269 | outs.append(out_str.format(name, outtype)) 270 | 271 | # Insert an index line 272 | if self._idx: 273 | outs.insert(0, sqldict['index']) 274 | 275 | # Write the column information 276 | sql.write(',\n'.join(outs)) 277 | sql.write(');\n') 278 | # Write the dialect-specific table generation command 279 | sql.write(sqldict['end'].format(table=table, csvname=csvname)) 280 | sql.close() 281 | 282 | def to_dataframe(self, chunksize=None, na='nan'): 283 | '''Return the DBF contents as a DataFrame. 284 | 285 | Parameters 286 | ---------- 287 | chunksize : int, optional 288 | Maximum number of records to process at any given time. If 'None' 289 | (default), process all records. 290 | 291 | na : various types accepted, optional 292 | The value that will be used to replace missing or malformed 293 | entries. Right now this accepts pretty much anything, and that 294 | value will be used as a replacement. 
(May not do what you expect.) 295 | However, the strings 'na' or 'nan' (case insensitive) will insert 296 | float('nan'), the string 'none' (case insensitive) or will insert 297 | the Python object `None`. Default for DataFrame is NaN ('nan'); 298 | however, float/int columns are always float('nan') 299 | 300 | Returns 301 | ------- 302 | DataFrame (chunksize == None) 303 | The DBF file contents as a Pandas DataFrame 304 | 305 | Generator (chunksize != None) 306 | This generator returns DataFrames with the maximum number of 307 | records equal to chunksize. (May be less) 308 | 309 | Notes 310 | ----- 311 | This method requires Pandas >= 0.15.2. 312 | ''' 313 | self._na_set(na) 314 | if not chunksize: 315 | # _get_recs is a generator, convert to list for DataFrame 316 | results = list(self._get_recs()) 317 | df = pd.DataFrame(results, columns=self.columns) 318 | del(results) # Free up the memory? If GC works properly 319 | return df 320 | else: 321 | # Return a generator function instead 322 | return self._df_chunks(chunksize) 323 | 324 | def _df_chunks(self, chunksize): 325 | '''A DataFrame chunk generator. 326 | 327 | See `to_dataframe`. 328 | ''' 329 | chunks = self._chunker(chunksize) 330 | # Keep track of the index, otherwise every DataFrame will be indexed 331 | # starting at 0 332 | idx = 0 333 | for chunk in chunks: 334 | results = list(self._get_recs(chunk=chunk)) 335 | num = len(results) # Avoids skipped records problem 336 | df = pd.DataFrame(results, columns=self.columns, 337 | index=range(idx, idx+num)) 338 | idx += num 339 | del(results) 340 | yield df 341 | 342 | def to_pandassql(self, engine, table=None, chunksize=None, na='nan'): 343 | '''Write DBF contents to an SQL database using Pandas. 344 | 345 | Parameters 346 | ---------- 347 | engine : SQLAlchemy Engine or string 348 | A SQLalchemy Engine instance or an SQL initialization string. See 349 | the SQL engine dialect documentation for more information. 350 | 351 | table : string, optional 352 | The name of the table to create for the DBF records. If 'None' 353 | (default), the DBF contents will be saved into a table with the 354 | same name as the input file without the file extension.The default 355 | behavior appends new data to an existing table. Delete the table 356 | by hand before running this method if you don't want the old data. 357 | 358 | chunksize : int, optional 359 | Maximum number of records to process at any given time. If 'None' 360 | (default), process all records. 361 | 362 | na : various types accepted, optional 363 | The value that will be used to replace missing or malformed 364 | entries. Right now this accepts pretty much anything, and that 365 | value will be used as a replacement. (May not do what you expect.) 366 | However, the strings 'na' or 'nan' (case insensitive) will insert 367 | float('nan'), the string 'none' (case insensitive) or will insert 368 | the Python object `None`. Default for SQL table is NaN ('nan'); 369 | however, float/int columns are always float('nan'). 370 | 371 | Notes 372 | ----- 373 | This method requires Pandas >= 0.15.2 and SQLalchemy >= 0.9.7. 374 | ''' 375 | self._na_set(na) 376 | if not table: 377 | table = self.dbf[:-4] # strip trailing ".dbf" 378 | 379 | if isinstance(engine, str): 380 | engine_inst = sql.create_engine(engine) 381 | elif isinstance(engine, sql.engine.Engine): 382 | engine_inst = engine 383 | else: 384 | error = 'The engine argument is not a string or SQLAlchemy' +\ 385 | 'engine.' 
386 | raise ValueError(error) 387 | 388 | # Setup string types for proper length, otherwise Pandas assumes 389 | # "Text" types, which may not be as efficient 390 | dtype = {} 391 | for field in self.fields: 392 | if field[1] == 'C': 393 | # Right now, Pandas doesn't support string length 394 | # Should work fine for sqlite and postgresql 395 | dtype[field[0]] = sql.types.String#(field[2]) 396 | 397 | # The default behavior is to append new data to existing tables. 398 | if not chunksize: 399 | df = self.to_dataframe() 400 | df.to_sql(table, engine_inst, dtype=dtype, if_exists='append') 401 | else: 402 | for df in self.to_dataframe(chunksize=chunksize): 403 | df.to_sql(table, engine_inst, dtype=dtype, if_exists='append') 404 | del(df) 405 | 406 | 407 | def to_pandashdf(self, h5name, table=None, chunksize=None, na='nan', 408 | complevel=9, complib='blosc', data_columns=None): 409 | '''Write DBF contents to an HDF5 file using Pandas. 410 | 411 | Parameters 412 | ---------- 413 | h5name : string 414 | The name of HDF file to use. By default, this file is opened in 415 | 'append' mode so that any existing files will not be overwritten, 416 | but it may cause problems. 417 | 418 | table : string, optional 419 | The name of the table to create for the DBF records. If 'None' 420 | (default), the DBF contents will be saved into a table with the 421 | same name as the input file without the file extension.The default 422 | behavior appends new data to an existing table. Delete the table 423 | by hand before running this method if you don't want the old data. 424 | 425 | chunksize : int, optional 426 | Maximum number of records to process at any given time. If 'None' 427 | (default), process all records. 428 | 429 | na : various types accepted, optional 430 | The value that will be used to replace missing or malformed 431 | entries. Right now this accepts pretty much anything, and that 432 | value will be used as a replacement. (May not do what you expect.) 433 | However, the strings 'na' or 'nan' (case insensitive) will insert 434 | float('nan'), the string 'none' (case insensitive) or will insert 435 | the Python object `None`. Default for HDF table is NaN ('nan'); 436 | however, float/int columns are always float('nan'). 437 | 438 | complib/complevel : int/string 439 | These keyword arguments set the compression library and level for 440 | the HDF file. These arguments are identical to the one defined for 441 | Pandas HDFStore, so see the Pandas documentation on `HDFStore` for 442 | more information. 443 | 444 | data_columns : list of column names or True 445 | This is a list of column names that will be created as data 446 | columns in the HDF file. This allows for advanced searching on 447 | these columns. If `True` is passed all columns will be data 448 | columns. There is some performace/file size degredation using this 449 | method, so for large numbers of columns, it is not recomended. See 450 | the Pandas IO documentation for more information. 451 | 452 | Notes 453 | ----- 454 | This method requires Pandas >= 0.15.2 and PyTables >= 3.1.1. 455 | 456 | The default here is to create a compressed HDF5 file using the 'blosc' 457 | compression library (compression level = 9). This shouldn't affect 458 | performance much, but it does save an enormous amount of disk space. 
459 | ''' 460 | self._na_set(na) 461 | if not table: 462 | table = self.dbf[:-4] # strip trailing ".dbf" 463 | 464 | h5 = pd.HDFStore(h5name, 'a', complevel=complevel, complib=complib) 465 | 466 | if not chunksize: 467 | df = self.to_dataframe() 468 | h5.append(table, df, data_columns=data_columns) 469 | else: 470 | # Find the maximum string column length This is necessary because 471 | # the appendable table can not change width if a new DF is added 472 | # with a longer string 473 | max_string_len = {} 474 | mx = 0 475 | for field in self.fields: 476 | if field[1] == "C" and field[2] > mx: 477 | mx = field[2] 478 | if mx != 0: 479 | max_string_len = {'values':mx} 480 | 481 | for df in self.to_dataframe(chunksize=chunksize): 482 | h5.append(table, df, min_itemsize=max_string_len, 483 | data_columns=data_columns) 484 | h5.flush(fsync=True) 485 | 486 | del(df) 487 | h5.close() 488 | 489 | class Dbf5(DbfBase): 490 | ''' 491 | DBF version 5 file processing object. 492 | 493 | This class defines the methods necessary for reading the header and 494 | records from a version 5 DBF file. Much of this code is based on an 495 | `ActiveState DBF example`_, which only worked for Python2. 496 | 497 | .. ActiveState DBF example: http://code.activestate.com/recipes/ 498 | 362715-dbf-reader-and-writer/ 499 | 500 | Parameters 501 | ---------- 502 | 503 | dbf : string 504 | The name (with optional path) of the DBF file. 505 | 506 | codec : string, optional 507 | The codec to use when decoding text-based records. The default is 508 | 'utf-8'. See Python's `codec` standard lib module for other options. 509 | 510 | Attributes 511 | ---------- 512 | 513 | dbf : string 514 | The input file name. 515 | 516 | f : file object 517 | The opened DBF file object 518 | 519 | numrec : int 520 | The number of records contained in this file. 521 | 522 | lenheader : int 523 | The length of the file header in bytes. 524 | 525 | numfields : int 526 | The number of data columns. 527 | 528 | fields : list of tuples 529 | Column descriptions as a tuple: (Name, Type, # of bytes). 530 | 531 | columns : list 532 | The names of the data columns. 533 | 534 | fmt : string 535 | The format string that is used to unpack each record from the file. 536 | 537 | fmtsiz : int 538 | The size of each record in bytes. 539 | ''' 540 | def __init__(self, dbf, codec='utf-8'): 541 | self._enc = codec 542 | path, name = os.path.split(dbf) 543 | self.dbf = name 544 | # Escape quotes, set by indiviual runners 545 | self._esc = None 546 | # Reading as binary so bytes will always be returned 547 | self.f = open(dbf, 'rb') 548 | 549 | self.numrec, self.lenheader = struct.unpack('Pandas conversion 630 | # Otherwise floats were not showing up correctly 631 | value = float('nan') 632 | 633 | # Date stores as string "YYYYMMDD", convert to datetime 634 | elif typ == 'D': 635 | try: 636 | y, m, d = int(value[:4]), int(value[4:6]), \ 637 | int(value[6:8]) 638 | if name not in self._dtypes: 639 | self._dtypes[name] = "date" 640 | except: 641 | value = self._na 642 | else: 643 | value = datetime.date(y, m, d) 644 | 645 | # Booleans can have multiple entry values 646 | elif typ == 'L': 647 | if name not in self._dtypes: 648 | self._dtypes[name] = "bool" 649 | if value in b'TyTt': 650 | value = True 651 | elif value in b'NnFf': 652 | value = False 653 | # '?' indicates an empty value, convert this to NaN 654 | else: 655 | value = self._na 656 | 657 | # Floating points are also stored as strings. 
658 | elif typ == 'F': 659 | if name not in self._dtypes: 660 | self._dtypes[name] = "float" 661 | try: 662 | value = float(value) 663 | except: 664 | value = float('nan') 665 | 666 | else: 667 | err = 'Column type "{}" not yet supported.' 668 | raise ValueError(err.format(typ)) 669 | 670 | result.append(value) 671 | yield result 672 | 673 | --------------------------------------------------------------------------------