├── .gitignore ├── simpledbf ├── __init__.py └── simpledbf.py ├── setup.py ├── LICENSE.txt ├── RELEASENOTES.rst └── README.rst /.gitignore: -------------------------------------------------------------------------------- 1 | *swp 2 | TODO* 3 | *pyc 4 | .pypirc 5 | dist/ 6 | simpledbf.egg* 7 | -------------------------------------------------------------------------------- /simpledbf/__init__.py: -------------------------------------------------------------------------------- 1 | from .simpledbf import Dbf5 2 | 3 | __all__ = ['Dbf5',] 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.rst') as file: 4 | long_description = file.read() 5 | 6 | setup( 7 | name = "simpledbf", 8 | version = "0.2.6", 9 | 10 | description = "Convert DBF files to CSV, DataFrames, HDF5 tables, and "\ 11 | "SQL tables. Python3 compatible.", 12 | url = "https://github.com/rnelsonchem/simpledbf", 13 | long_description = long_description, 14 | 15 | author = "Ryan Nelson", 16 | author_email = "rnelsonchem@gmail.com", 17 | 18 | license = "BSD", 19 | classifiers = [ 20 | 'Development Status :: 4 - Beta', 21 | 'Intended Audience :: Developers', 22 | 'Intended Audience :: Science/Research', 23 | 'License :: OSI Approved :: BSD License', 24 | 'Programming Language :: Python :: 2', 25 | 'Programming Language :: Python :: 2.7', 26 | 'Programming Language :: Python :: 3', 27 | 'Programming Language :: Python :: 3.4', 28 | ], 29 | 30 | keywords = "DBF CSV Pandas SQLalchemy PyTables DataFrame SQL HDF", 31 | 32 | packages = find_packages(), 33 | 34 | ) 35 | 36 | 37 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Ryan Nelson. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the NumPy Developers nor the names of any 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /RELEASENOTES.rst: -------------------------------------------------------------------------------- 1 | simpledbf 0.2.6 Release Notes 2 | ############################# 3 | 4 | Bug Fixes 5 | --------- 6 | 7 | * Fixed the pure-CSV output function. Fix contributed by @sarasafavi. 8 | 9 | simpledbf 0.2.4 Release Notes 10 | ############################# 11 | 12 | Highlights 13 | ---------- 14 | 15 | * Added `data_columns`, `complib`, and `complevel` keyword arguments to the 16 | `to_pandashdf` method. The function of these mirrors the equivalent 17 | arguments used in the Pandas HDFStore object and HDFStore.append method. See 18 | the Pandas docs for more information. 19 | 20 | Bug Fixes 21 | --------- 22 | 23 | * Added an index column to the CSV-based SQL output. This is necessary for 24 | selecting data from the SQL file, and also for reading the SQL table back into a 25 | DataFrame at a later time. 26 | 27 | simpledbf 0.2.3 Release Notes 28 | ############################# 29 | 30 | Highlights 31 | ---------- 32 | 33 | * Added a pure-Python SQL option which writes a SQL table creation file and a 34 | header-less CSV file. This is much more efficient for uploading large data 35 | files to SQL. 36 | 37 | API Changes 38 | ----------- 39 | 40 | * The default NaN value for float/int columns was changed to always be 41 | ``float('nan')``. This is necessary for DBF->SQL->DF conversion, even though 42 | the CSV files now have 'nan' for all empty values. 43 | 44 | simpledbf 0.2.2 Release Notes 45 | ############################# 46 | 47 | Highlights 48 | ---------- 49 | 50 | * Added an optional 'codec' keyword argument to Dbf5 __init__, which controls 51 | the decoding of the values in the DBF file and CSV output file. Default is 52 | 'utf-8'. 53 | 54 | * Made a couple of small algorithmic changes that improved performance. 55 | 56 | Bug Fixes 57 | --------- 58 | 59 | * The 'na' flag now works properly. (In previous versions, it was always 60 | setting empty values to the string 'nan'.) 61 | 62 | * Properly set the string column width for HDF chunksize-only output. (The 63 | column width is set to max(string len) by default, which may not be the 64 | largest for every chunk. Used the dbf header info to fix this.) 65 | 66 | simpledbf 0.2.1 Release Notes 67 | ############################# 68 | 69 | Highlights 70 | ---------- 71 | 72 | * Added a 'na' keyword argument that controls the value of missing/bad data. 73 | 74 | * Set the default 'na' to the empty string ('') for CSV and NaN ('nan') for 75 | all other exports. 76 | 77 | simpledbf 0.2.0 Release Notes 78 | ############################# 79 | 80 | Functionality stays the same, but a few implementation details have changed. 81 | Tested with Python2, and everything except HDF export works fine. 82 | 83 | Highlights 84 | ---------- 85 | 86 | * Empty strings are converted to NaN (i.e. `float('nan')`). 87 | 88 | * Added try/except clauses to all other types, so poorly formatted values 89 | will be returned as NaN as well. This may not be the behavior that is 90 | expected, so be careful. 91 | 92 | simpledbf 0.1.0 Release Notes 93 | ############################# 94 | 95 | First release.
96 | 97 | Highlights 98 | ---------- 99 | 100 | * Pure-Python3 read of DBF files 101 | 102 | * Pure-Python3 write as CSV 103 | 104 | * Convert to DataFrame (Pandas required) 105 | 106 | * Convert to HDF5 table (Pandas and PyTables required) 107 | 108 | * Convert to SQL table (Pandas and SQLalchemy required) 109 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | simpledbf 2 | ######### 3 | 4 | *simpledbf* is a Python library for converting basic DBF files (see 5 | `Limitations`_) to CSV files, Pandas DataFrames, SQL tables, or HDF5 tables. 6 | This package is fully compatible with Python >=3.4, with almost complete 7 | `Python 2.7 support`_ as well. The conversion to CSV and SQL (see 8 | ``to_textsql`` below) is entirely written in Python, so no additional 9 | dependencies are necessary. For other export formats, see `Optional 10 | Requirements`_. This code was designed to be very simple, fast and memory 11 | efficient for convenient interactive or batch file processing; therefore, it 12 | lacks many features, such as the ability to write DBF files, that other 13 | packages might provide. 14 | 15 | Bug fixes, questions, and update requests are encouraged and can be filed at 16 | the `GitHub repo`_. 17 | 18 | This code is derived from an `ActiveState DBF example`_ that works with 19 | Python2 and is distributed under a PSF license. 20 | 21 | 22 | .. _Optional Requirements: 23 | 24 | Optional Requirements 25 | --------------------- 26 | 27 | * Pandas >= 0.15.2 (Required for DataFrame) 28 | 29 | * PyTables >= 3.1 (with Pandas required for HDF tables) 30 | 31 | * SQLalchemy >= 0.9 (with Pandas required for DataFrame-SQL tables) 32 | 33 | Installation 34 | ------------ 35 | 36 | The most recent release of *simpledbf* can be installed using ``pip`` or 37 | ``conda``, if you happen to be using the `Anaconda Python distribution`_. 38 | 39 | Using ``conda``:: 40 | 41 | $ conda install -c https://conda.binstar.org/rnelsonchem simpledbf 42 | 43 | Using ``pip``:: 44 | 45 | $ pip install simpledbf 46 | 47 | The development version can be installed from GitHub:: 48 | 49 | $ pip install git+https://github.com/rnelsonchem/simpledbf.git 50 | 51 | As an alternative, this package only contains a single file, so in principle, 52 | you could download the ``simpledbf.py`` file from Github and put it in any 53 | folder of your choosing. 54 | 55 | 56 | .. _Limitations: 57 | 58 | DBF File Limitations 59 | -------------------- 60 | 61 | This package currently supports a subset of `dBase III through 5`_ DBF files. 62 | In particular, support is missing for linked memo (i.e. DBT) files. This is 63 | mostly due to limitations in the types of files available to the author. Feel 64 | free to request an update if you can supply a DBF file with an associated memo 65 | file. `DBF version 7`_, the most recent DBF file spec, is not currently 66 | supported by this package. 67 | 68 | 69 | .. _Python 2.7 support: 70 | 71 | Python 2 Support 72 | ---------------- 73 | 74 | Except for HDF file export, this code should work fine with Python >=2.7. 75 | However, HDF files created in Python3 are compatible with all Python2 HDF 76 | packages, so in principle, you could make any HDF files in a temporary Python3 77 | environment. If you are using the `Anaconda Python distribution`_ 78 | (recommended), then you can make a small Python3 working environment as 79 | follows: 80 | 81 | .. 
code:: 82 | 83 | $ conda create -n dbf python=3 pandas pytables sqlalchemy 84 | # Lots of output... 85 | 86 | $ source activate dbf 87 | 88 | dbf>$ conda install -c https://conda.binstar.org/rnelsonchem simpledbf 89 | 90 | dbf>$ python my_py3_hdf_creation_script.py 91 | # This is using Python3 92 | 93 | dbf>$ source deactivate 94 | 95 | $ python my_py2_stuff_with_hdf.py 96 | # This is using Python2 again 97 | 98 | HDF file export is currently broken in Python2 due to a `limitation in Pandas 99 | HDF export with unicode`_. This issue may be fixed in future versions of 100 | Pandas/PyTables. 101 | 102 | 103 | Example Usage 104 | ############# 105 | 106 | .. _Loading: 107 | 108 | Load a DBF file 109 | --------------- 110 | 111 | This module currently only defines a single class, ``Dbf5``, which is 112 | instantiated with a DBF file name, which can contain path info as well. An 113 | optional 'codec' keyword argument controls the codec used for 114 | reading/writing files. The default is 'utf-8'. See the documentation for 115 | Python's `codec standard library module`_ for more codec options. 116 | 117 | .. code:: 118 | 119 | In : from simpledbf import Dbf5 120 | 121 | In : dbf = Dbf5('fake_file_name.dbf', codec='utf-8') 122 | 123 | The ``Dbf5`` object initially only reads the header information from the file, 124 | so you can inspect some of the properties. For example, ``numrec`` is the 125 | number of records in the DBF file, and ``fields`` is a list of tuples with 126 | information about the data columns. See the DBF file spec for info on the 127 | column type characters. The "DeletionFlag" column is always present as a check 128 | for deleted records; however, it is never exported during conversion. 129 | 130 | .. code:: 131 | 132 | In : dbf.numrec 133 | Out: 10000 134 | 135 | In : dbf.fields 136 | Out: [('DeletionFlag', 'C', 1), ('col_1', 'C', 15), ('col_2', 'N', 2)] 137 | 138 | The docstring for this object contains a complete listing of attributes and 139 | their descriptions. 140 | 141 | The ``mem`` method gives an approximate memory requirement for processing this 142 | DBF file. (~2x the total file size, which could be wildly inaccurate.) In 143 | addition, all of the output methods in this object take a ``chunksize`` 144 | keyword argument, which lets you split up the processing of large files into 145 | smaller chunks to limit the total memory usage of the conversion process. When 146 | this keyword argument is passed into ``mem``, the approximate memory footprint 147 | of the chunk will also be given, which can be useful when trying to determine 148 | the maximum chunksize your memory will allow. 149 | 150 | .. code:: 151 | 152 | In : dbf.mem() 153 | This total process would require more than 350.2 MB of RAM. 154 | 155 | In : dbf.mem(chunksize=1000) 156 | Each chunk will require 4.793 MB of RAM. 157 | This total process would require more than 350.2 MB of RAM. 158 | 159 | 160 | Export the Data 161 | --------------- 162 | 163 | The ``Dbf5`` object behaves like Python's file object in that it will be 164 | "exhausted" after export. To re-export the DBF data to a different format, 165 | first create a new ``Dbf5`` instance using the same file name. This procedure 166 | is followed in the documentation below. 167 | 168 | 169 | Note on Empty/Bad Data 170 | ++++++++++++++++++++++ 171 | 172 | This package attempts to convert most blank strings and poorly formatted 173 | values to an empty value of your choosing. This is controlled by the ``na`` 174 | keyword argument to all export functions. The default for CSV is an empty 175 | string (''), and for all other exports, it is 'nan', which converts empty/bad 176 | values to ``float('nan')``. *NOTE* The exception here is that float/int 177 | columns always use ``float('nan')`` for all missing values for 178 | DBF->SQL->DataFrame conversion purposes. Pandas has very powerful functions 179 | for `working with missing data`_, including converting NaN to other values 180 | (e.g. empty strings).
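If a different placeholder is more convenient, the swap can also be done after conversion with Pandas itself. The snippet below is only a sketch (it assumes Pandas is installed and reuses the placeholder file name from the examples that follow); ``fillna`` simply replaces every NaN with an empty string.

.. code::

    In : from simpledbf import Dbf5

    In : dbf = Dbf5('fake_file_name.dbf')

    In : df = dbf.to_dataframe()

    In : df = df.fillna('')
    # All float('nan') placeholders are now empty strings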
181 | 182 | 183 | To CSV 184 | ++++++ 185 | 186 | Use the ``to_csv`` method to export the data to a CSV file. This method 187 | requires the name of a CSV file as an input. The default behavior is to append 188 | new data to an existing file, so be careful if the file already exists. The 189 | ``chunksize`` keyword argument controls how often the file buffer 190 | will be flushed, which may not be necessary. The ``na`` keyword changes the 191 | value used for missing/bad entries (default is ''). The keyword ``header`` is 192 | a boolean that controls writing of the column names as the first row of the 193 | CSV file. The encoding of the resulting CSV file is determined by the codec 194 | that is set when opening the DBF file; see `Loading`_. 195 | 196 | .. code:: 197 | 198 | In : dbf = Dbf5('fake_file_name.dbf') 199 | 200 | In : dbf.to_csv('junk.csv') 201 | 202 | If you are unhappy with the default CSV output of this module, Pandas also has 203 | very `powerful CSV export capabilities`_ for DataFrames.
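For example, a DataFrame round-trip gives you full control over quoting, separators, and encoding. This is only an illustrative sketch (Pandas required; 'junk_df.csv' is an arbitrary output name):

.. code::

    In : dbf = Dbf5('fake_file_name.dbf')

    In : df = dbf.to_dataframe()

    In : df.to_csv('junk_df.csv', index=False, encoding='utf-8')
    # Pandas writes the CSV instead of simpledbf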
204 | 205 | 206 | To SQL (CSV-based) 207 | ++++++++++++++++++ 208 | 209 | Most SQL databases can create tables directly from local CSV files. The 210 | pure-Python ``to_textsql`` method creates two files: 1) a header-less CSV file 211 | containing the DBF contents, and 2) a SQL file containing the appropriate 212 | table creation and CSV import code. It is up to you to run the SQL file as a 213 | separate step. This function takes two mandatory arguments, which are simply 214 | the names of the SQL and CSV files, respectively. In addition, there are a 215 | number of optional keyword arguments as well. ``sqltype`` controls the output 216 | dialect. The default is 'sqlite', but 'postgres' is also accepted. ``table`` 217 | sets the name of the SQL table that will be created. By default, this will be 218 | the name of the DBF file without the file extension. Quote 219 | characters (") in text fields must be escaped in the CSV file. This is controlled with the ``escapequote`` 220 | keyword, which defaults to ``'"'``. (This changes '"' in text strings to '""', 221 | which the SQL server should ignore.) The ``chunksize``, ``na``, and ``header`` 222 | keywords are used to control the CSV file. See above. 223 | 224 | Here's an example for SQLite: 225 | 226 | .. code:: 227 | 228 | In : dbf = Dbf5('fake_file_name.dbf') 229 | 230 | In : dbf.to_textsql('junk.sql', 'junk.csv') 231 | 232 | # Exit Python 233 | $ sqlite3 junk.db < junk.sql 234 | 235 | Here's an example for Postgresql: 236 | 237 | .. code:: 238 | 239 | In : dbf = Dbf5('fake_file_name.dbf') 240 | 241 | In : dbf.to_textsql('junk.sql', 'junk.csv', sqltype='postgres') 242 | 243 | # Exit Python 244 | $ psql -U username -f junk.sql db_name 245 | 246 | To DataFrame 247 | ++++++++++++ 248 | 249 | The ``to_dataframe`` method returns the DBF records as a Pandas DataFrame. If 250 | the size of the DBF file exceeds available memory, then passing the 251 | ``chunksize`` keyword argument will return a generator function. This 252 | generator yields DataFrames of length <= ``chunksize`` until all of the records have 253 | been processed. The ``na`` keyword changes the value used for missing/bad 254 | entries (default is 'nan', which inserts ``float('nan')``). 255 | 256 | .. code:: 257 | 258 | In : dbf = Dbf5('fake_file_name.dbf') 259 | 260 | In : df = dbf.to_dataframe() 261 | # df is a DataFrame with all records 262 | 263 | In : dbf = Dbf5('fake_file_name.dbf') 264 | 265 | In : for df in dbf.to_dataframe(chunksize=10000): 266 | .... do_cool_stuff(df) 267 | # Here a generator is returned 268 | 269 | .. _chunksize issue: 270 | 271 | Issue with DataFrame Chunksize 272 | ++++++++++++++++++++++++++++++ 273 | 274 | When a DataFrame is constructed, it attempts to determine the dtype of each 275 | column. If you chunk the DataFrame output, it turns out that the dtype for a 276 | column can change. For example, if one chunk has a column with all strings, 277 | the dtype will be ``np.object``; however, if in the next chunk that same 278 | column is full of ``float('nan')``, the resulting dtype will be set as 279 | ``float``. This has some consequences for writing to SQL and HDF tables as 280 | well. In principle, this behavior could be changed, but it is currently 281 | non-trivial to set the dtypes for DataFrame columns on construction. Please 282 | file a PR through GitHub if this is a big problem.
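Until then, one possible workaround (shown only as a sketch, not a feature of this package) is to coerce the affected column to a fixed dtype as each chunk is yielded; 'col_1' below is a hypothetical text column that should stay object-typed:

.. code::

    In : dbf = Dbf5('fake_file_name.dbf')

    In : for df in dbf.to_dataframe(chunksize=10000):
    .... df['col_1'] = df['col_1'].astype(object)
    .... do_cool_stuff(df)
    # Every chunk now reports the same dtype for 'col_1'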
283 | 284 | 285 | To an SQL Table using Pandas 286 | ++++++++++++++++++++++++++++ 287 | 288 | The ``to_pandassql`` method will transfer the DBF entries to an SQL database 289 | table of your choice using a combination of Pandas DataFrames and SQLalchemy. 290 | A valid `SQLalchemy engine string`_ argument is required to connect with the 291 | database. Database support will be limited to those supported by SQLalchemy. 292 | (This has been tested with SQLite and Postgresql.) Note: if you are 293 | transferring a large amount of data, this method will be very slow. If you 294 | have direct access to the SQL server, you might want to use the text-based SQL 295 | export instead. 296 | 297 | .. code:: 298 | 299 | In : dbf = Dbf5('fake_file_name.dbf') 300 | 301 | In : dbf.to_pandassql('sqlite:///foo.db') 302 | 303 | This method accepts three optional arguments. ``table`` is the name of the 304 | table you'd like to use. If this is not passed, your new table will have the 305 | same name as the DBF file without the file extension. Again, the default here is 306 | to append to an existing table. If you want to start fresh, delete the 307 | existing table before using this function. The ``chunksize`` keyword processes 308 | the DBF file in chunks of records no larger than this size. The ``na`` keyword 309 | changes the value used for missing/bad entries (default is 'nan', which inserts 310 | ``float('nan')``). 311 | 312 | .. code:: 313 | 314 | In : dbf = Dbf5('fake_file_name.dbf') 315 | 316 | In : dbf.to_pandassql('sqlite:///foo.db', table="fake_tbl", 317 | .... chunksize=100000) 318 | 319 | 320 | To an HDF5 Table 321 | ++++++++++++++++ 322 | 323 | The ``to_pandashdf`` method transfers the DBF entries to an HDF5 table of your 324 | choice. This method uses a combination of Pandas DataFrames and PyTables, so 325 | both of these packages must be installed. This method requires a file name 326 | string for the HDF file, which will be created if it does not exist. Again, 327 | the default behavior is to append to an existing file of that name, so be 328 | careful here. The HDF file will be created using the highest level of 329 | compression (9) with the 'blosc' compression lib. This saves an enormous 330 | amount of disk space, with little degradation of performance; however, this 331 | compression library is non-standard, which can cause problems with other HDF 332 | libraries. Compression options are controlled using the ``complib`` and 333 | ``complevel`` keyword arguments, which are identical to the ones described in 334 | the `Pandas HDF compression docs`_. 335 | 336 | .. code:: 337 | 338 | In : dbf = Dbf5('fake_file_name.dbf') 339 | 340 | In : dbf.to_pandashdf('fake.h5') 341 | 342 | This method uses the same optional arguments, and corresponding defaults, as 343 | ``to_pandassql`` (see above). An example with ``chunksize`` is shown below. In 344 | addition, a ``data_columns`` keyword argument is also available, which sets 345 | the columns that will be used as data columns in the HDF table. Data columns 346 | can be used for advanced searching and selection; however, there is some 347 | degradation of performance for large numbers of data columns. See the `Pandas 348 | data columns docs`_ for a more detailed explanation. 349 | 350 | .. code:: 351 | 352 | In : dbf = Dbf5('fake_file_name.dbf') 353 | 354 | In : dbf.to_pandashdf('fake.h5', table="fake_tbl", chunksize=100000) 355 | 356 | See the `chunksize issue`_ for DataFrame export for information on a potential 357 | problem you may encounter with chunksize. 358 | 359 | 360 | Batch Export 361 | ++++++++++++ 362 | 363 | Batch file export is trivial using *simpledbf*. For example, the following 364 | code processes all DBF files in the current directory into separate tables in 365 | a single HDF file. 366 | 367 | .. code:: 368 | 369 | In : import os 370 | 371 | In : from simpledbf import Dbf5 372 | 373 | In : files = os.listdir('.') 374 | 375 | In : for f in files: 376 | .... if f[-3:].lower() == 'dbf': 377 | .... dbf = Dbf5(f) 378 | .... dbf.to_pandashdf('all_data.h5') 379 | 380 | 381 | .. External Hyperlinks 382 | 383 | .. _ActiveState DBF example: http://code.activestate.com/recipes/ 384 | 362715-dbf-reader-and-writer/ 385 | .. _GitHub repo: https://github.com/rnelsonchem/simpledbf 386 | .. _dBase III through 5: http://ulisse.elettra.trieste.it/services/doc/ 387 | dbase/DBFstruct.htm 388 | .. _DBF version 7: http://www.dbase.com/KnowledgeBase/int/db7_file_fmt.htm 389 | .. _Anaconda Python distribution: http://continuum.io/downloads 390 | .. _limitation in Pandas HDF export with unicode: http://pandas.pydata.org/ 391 | pandas-docs/stable/io.html#datatypes 392 | .. _codec standard library module: https://docs.python.org/3.4/library/ 393 | codecs.html 394 | .. _working with missing data: http://pandas.pydata.org/pandas-docs/stable/ 395 | missing_data.html 396 | .. _powerful CSV export capabilities: http://pandas.pydata.org/pandas-docs/ 397 | stable/io.html#writing-to-csv-format 398 | .. _SQLalchemy engine string: http://docs.sqlalchemy.org/en/rel_0_9/core/ 399 | engines.html 400 | .. _Pandas HDF compression docs: http://pandas.pydata.org/pandas-docs/stable/ 401 | io.html#compression 402 | ..
_Pandas data columns docs: http://pandas.pydata.org/pandas-docs/stable/ 403 | io.html#query-via-data-columns 404 | 405 | 406 | 407 | -------------------------------------------------------------------------------- /simpledbf/simpledbf.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import datetime 3 | import os 4 | import codecs 5 | 6 | # Check for optional dependencies. 7 | try: 8 | import pandas as pd 9 | except: 10 | print("Pandas is not installed. No support for DataFrames, HDF, or SQL.") 11 | else: 12 | try: 13 | import tables as tb 14 | except: 15 | print("PyTables is not installed. No support for HDF output.") 16 | try: 17 | import sqlalchemy as sql 18 | except: 19 | print("SQLalchemy is not installed. No support for SQL output.") 20 | 21 | sqltypes = { 22 | 'sqlite': {'str':'TEXT', 'float':'REAL', 'int': 'INTEGER', 23 | 'date':'TEXT', 'bool':'INTEGER', 24 | 'end': '.mode csv {table}\n.import {csvname} {table}', 25 | 'start': 'CREATE TABLE {} (\n', 26 | 'index': '"index" INTEGER PRIMARY KEY ASC', 27 | }, 28 | 'postgres': {'str': 'text', 'float': 'double precision', 29 | 'int':'bigint', 'date':'date', 'bool':'boolean', 30 | 'end': '''\copy "{table}" from '{csvname}' delimiter ',' csv''', 31 | 'start': 'CREATE TABLE "{}" (\n', 32 | 'index': '"index" INTEGER PRIMARY KEY', 33 | }, 34 | } 35 | 36 | class DbfBase(object): 37 | ''' 38 | Base class for DBF file processing objects. 39 | 40 | Do not instantiate this class. This provides some of the common functions 41 | for other subclasses. 42 | ''' 43 | def _chunker(self, chunksize): 44 | '''Return a list of chunk ints from given chunksize. 45 | 46 | Parameters 47 | ---------- 48 | chunksize : int 49 | The maximum chunk size 50 | 51 | Returns 52 | ------- 53 | list of ints 54 | A list of chunks necessary to break up a given file. These will 55 | all be equal to `chunksize`, except for the last value, which is 56 | the remainder (<= `chunksize). 57 | ''' 58 | num = self.numrec//chunksize 59 | # Chunksize bigger than numrec 60 | if num == 0: 61 | return [self.numrec,] 62 | else: 63 | chunks = [chunksize,]*num 64 | remain = self.numrec%chunksize 65 | if remain != 0: 66 | chunks.append(remain) 67 | return chunks 68 | 69 | def _na_set(self, na): 70 | '''Set the value used for missing/bad data. 71 | 72 | Parameters 73 | ---------- 74 | na : various types accepted 75 | The value that will be used to replace missing or malformed 76 | entries. Right now this accepts pretty much anything, and that 77 | value will be used as a replacement. (May not do what you expect.) 78 | However, the strings 'na' or 'nan' (case insensitive) will insert 79 | float('nan'), the string 'none' (case insensitive) or will insert 80 | the Python object `None`. Float/int columns are always 81 | float('nan') regardless of this setting. 82 | ''' 83 | if na.lower() == 'none': 84 | self._na = None 85 | elif na.lower() in ('na', 'nan'): 86 | self._na = float('nan') 87 | else: 88 | self._na = na 89 | 90 | def mem(self, chunksize=None): 91 | '''Print the memory usage for processing the DBF File. 92 | 93 | Parameters 94 | ---------- 95 | chunksize : int, optional 96 | The maximum chunk size that will be used to process this file. 97 | 98 | Notes 99 | ----- 100 | This method will print the maximum amount of RAM that will be 101 | necessary to process and load the DBF file. (This is ~2x the file 102 | size.) 
However, if the optional chunksize is passed, this function 103 | will also print memory usage per chunk as well, which can be useful 104 | for efficiently chunking and processing a file. 105 | ''' 106 | if chunksize: 107 | if chunksize > self.numrec: 108 | print("Chunksize larger than number of recs.") 109 | print("Chunksize set to {:d}.".format(self.numrec)) 110 | else: 111 | smallmem = 2.*(self.fmtsiz*chunksize/1024**2) 112 | chkout = "Each chunk will require {:.4g} MB of RAM." 113 | print(chkout.format(smallmem)) 114 | memory = 2.*(self.fmtsiz*self.numrec/1024**2) 115 | out = "This total process would require more than {:.4g} MB of RAM." 116 | print(out.format(memory)) 117 | 118 | def to_csv(self, csvname, chunksize=None, na='', header=True): 119 | '''Write DBF file contents to a CSV file. 120 | 121 | Parameters 122 | ---------- 123 | csvname : string 124 | The name of the CSV file that will be created. By default, the 125 | file will be opened in 'append' mode. This won't delete an already 126 | existing file, but it will add new data to the end. May not be 127 | what you want. 128 | 129 | chunksize : int, optional 130 | If this is set, the contents of the file buffer will be flushed 131 | after processing this many records. May be useful for very large 132 | files that exceed the available RAM. 133 | 134 | na : various types accepted, optional 135 | The value that will be used to replace missing or malformed 136 | entries. Right now this accepts pretty much anything, and that 137 | value will be used as a replacement. (May not do what you expect.) 138 | However, the strings 'na' or 'nan' (case insensitive) will insert 139 | float('nan'), the string 'none' (case insensitive) or will insert 140 | the Python object `None`. Default for CSV is an empty string (''); 141 | however, float/int columns are always float('nan'). 142 | 143 | header : boolean, optional 144 | Write out a header line with the column names. Default is True. 145 | ''' 146 | self._na_set(na) 147 | # set index column; this is only True when used with to_textsql() 148 | self._idx = False 149 | csv = codecs.open(csvname, 'a', encoding=self._enc) 150 | if header: 151 | column_line = ','.join(self.columns) 152 | csv.write(column_line + '\n') 153 | 154 | # Build up a formatting string for output. 155 | outs = [] 156 | for field in self.fields: 157 | if field[0] == "DeletionFlag": 158 | # Add an index column placeholder 159 | if self._idx: 160 | outs.append('{}') 161 | else: 162 | continue 163 | # Wrap strings in quotes 164 | elif field[1] in 'CDL': 165 | outs.append('"{}"') 166 | elif field[1] in 'NF': 167 | outs.append('{}') 168 | # Make the outline unicode or it won't write out properly for UTF-8 169 | out_line = u','.join(outs) + '\n' 170 | 171 | count = 0 172 | for n, result in enumerate(self._get_recs()): 173 | if self._idx: 174 | out_string = out_line.format(n, *result) 175 | else: 176 | out_string = out_line.format(*result) 177 | 178 | csv.write(out_string) 179 | count += 1 180 | if count == chunksize: 181 | csv.flush() 182 | count = 0 183 | csv.close() 184 | 185 | def to_textsql(self, sqlname, csvname, sqltype='sqlite', table=None, 186 | chunksize=None, na='', header=False, escapequote='"'): 187 | '''Write a SQL input file along with a CSV File. 188 | 189 | This function generates a header-less CSV file along with an SQL input 190 | file. The SQL file creates the database table and imports the CSV 191 | data. This works sqlite and postgresql. 
192 | 193 | Parameters 194 | ---------- 195 | sqlname : str 196 | Name of the SQL text file that will be created. 197 | 198 | csvname : str 199 | Name of the CSV file to be generated. See `to_csv`. 200 | 201 | sqltype : str, optional 202 | SQL dialect to use for SQL file. Default is 'sqlite'. Also accepts 203 | 'postgres' for Postgresql. 204 | 205 | table : str or None, optional 206 | Table name to generate. If None (default), the table name will be 207 | the name of the DBF input file without the file extension. 208 | Otherwise, the given string will be used. 209 | 210 | chunksize : int, optional 211 | Maximum number of records per chunk when writing the CSV. Default is None. See 212 | `to_csv`. 213 | 214 | na : various types accepted, optional 215 | Type to use for missing values. Default is ''. See `to_csv`. 216 | 217 | header : bool, optional 218 | Write header to the CSV output file. Default is False. Some SQL 219 | engines try to process a header line as data, which can be a 220 | problem. 221 | 222 | escapequote : str, optional 223 | Use this character to escape quotes (") in string columns. The 224 | default is `'"'`. For sqlite and postgresql, a doubled quote 225 | character in a text string is read back as a single quote character, i.e. '""' 226 | is converted to '"'. 227 | ''' 228 | # Create an index column 229 | self._idx = True 230 | # Set the quote escape 231 | self._esc = escapequote 232 | # Get a dictionary of type conversions for a particular sql dialect 233 | sqldict = sqltypes[sqltype] 234 | # Create table name if not given 235 | if not table: 236 | table = self.dbf[:-4] # strip trailing ".dbf" 237 | # Write the csv file 238 | self.to_csv(csvname, chunksize=chunksize, na=na, header=header) 239 | 240 | # Write the header for the table creation. 241 | sql = codecs.open(sqlname, 'w', encoding=self._enc) 242 | head = sqldict['start'] 243 | sql.write(head.format(table)) 244 | 245 | # Make an output string and container for all strings. 246 | out_str = '"{}" {}' 247 | outs = [] 248 | for field in self.fields: 249 | name, typ, size = field 250 | # Skip the first field 251 | if name == "DeletionFlag": 252 | continue 253 | 254 | # Convert Python type to SQL type 255 | if name in self._dtypes: 256 | dtype = self._dtypes[name] 257 | outtype = sqldict[dtype] 258 | else: 259 | # If the column does not have a type, probably all missing 260 | # Try our best to make it the correct type for self._na 261 | if typ == 'C': 262 | outtype = sqldict['str'] 263 | elif typ in 'NF': 264 | outtype = sqldict['float'] 265 | elif typ == 'L': 266 | outtype = sqldict['bool'] 267 | elif typ == 'D': 268 | outtype = sqldict['date'] 269 | outs.append(out_str.format(name, outtype)) 270 | 271 | # Insert an index line 272 | if self._idx: 273 | outs.insert(0, sqldict['index']) 274 | 275 | # Write the column information 276 | sql.write(',\n'.join(outs)) 277 | sql.write(');\n') 278 | # Write the dialect-specific table generation command 279 | sql.write(sqldict['end'].format(table=table, csvname=csvname)) 280 | sql.close() 281 | 282 | def to_dataframe(self, chunksize=None, na='nan'): 283 | '''Return the DBF contents as a DataFrame. 284 | 285 | Parameters 286 | ---------- 287 | chunksize : int, optional 288 | Maximum number of records to process at any given time. If 'None' 289 | (default), process all records. 290 | 291 | na : various types accepted, optional 292 | The value that will be used to replace missing or malformed 293 | entries. Right now this accepts pretty much anything, and that 294 | value will be used as a replacement. 
(May not do what you expect.) 295 | However, the strings 'na' or 'nan' (case insensitive) will insert 296 | float('nan'), the string 'none' (case insensitive) or will insert 297 | the Python object `None`. Default for DataFrame is NaN ('nan'); 298 | however, float/int columns are always float('nan') 299 | 300 | Returns 301 | ------- 302 | DataFrame (chunksize == None) 303 | The DBF file contents as a Pandas DataFrame 304 | 305 | Generator (chunksize != None) 306 | This generator returns DataFrames with the maximum number of 307 | records equal to chunksize. (May be less) 308 | 309 | Notes 310 | ----- 311 | This method requires Pandas >= 0.15.2. 312 | ''' 313 | self._na_set(na) 314 | if not chunksize: 315 | # _get_recs is a generator, convert to list for DataFrame 316 | results = list(self._get_recs()) 317 | df = pd.DataFrame(results, columns=self.columns) 318 | del(results) # Free up the memory? If GC works properly 319 | return df 320 | else: 321 | # Return a generator function instead 322 | return self._df_chunks(chunksize) 323 | 324 | def _df_chunks(self, chunksize): 325 | '''A DataFrame chunk generator. 326 | 327 | See `to_dataframe`. 328 | ''' 329 | chunks = self._chunker(chunksize) 330 | # Keep track of the index, otherwise every DataFrame will be indexed 331 | # starting at 0 332 | idx = 0 333 | for chunk in chunks: 334 | results = list(self._get_recs(chunk=chunk)) 335 | num = len(results) # Avoids skipped records problem 336 | df = pd.DataFrame(results, columns=self.columns, 337 | index=range(idx, idx+num)) 338 | idx += num 339 | del(results) 340 | yield df 341 | 342 | def to_pandassql(self, engine, table=None, chunksize=None, na='nan'): 343 | '''Write DBF contents to an SQL database using Pandas. 344 | 345 | Parameters 346 | ---------- 347 | engine : SQLAlchemy Engine or string 348 | A SQLalchemy Engine instance or an SQL initialization string. See 349 | the SQL engine dialect documentation for more information. 350 | 351 | table : string, optional 352 | The name of the table to create for the DBF records. If 'None' 353 | (default), the DBF contents will be saved into a table with the 354 | same name as the input file without the file extension.The default 355 | behavior appends new data to an existing table. Delete the table 356 | by hand before running this method if you don't want the old data. 357 | 358 | chunksize : int, optional 359 | Maximum number of records to process at any given time. If 'None' 360 | (default), process all records. 361 | 362 | na : various types accepted, optional 363 | The value that will be used to replace missing or malformed 364 | entries. Right now this accepts pretty much anything, and that 365 | value will be used as a replacement. (May not do what you expect.) 366 | However, the strings 'na' or 'nan' (case insensitive) will insert 367 | float('nan'), the string 'none' (case insensitive) or will insert 368 | the Python object `None`. Default for SQL table is NaN ('nan'); 369 | however, float/int columns are always float('nan'). 370 | 371 | Notes 372 | ----- 373 | This method requires Pandas >= 0.15.2 and SQLalchemy >= 0.9.7. 374 | ''' 375 | self._na_set(na) 376 | if not table: 377 | table = self.dbf[:-4] # strip trailing ".dbf" 378 | 379 | if isinstance(engine, str): 380 | engine_inst = sql.create_engine(engine) 381 | elif isinstance(engine, sql.engine.Engine): 382 | engine_inst = engine 383 | else: 384 | error = 'The engine argument is not a string or SQLAlchemy' +\ 385 | 'engine.' 
386 | raise ValueError(error) 387 | 388 | # Setup string types for proper length, otherwise Pandas assumes 389 | # "Text" types, which may not be as efficient 390 | dtype = {} 391 | for field in self.fields: 392 | if field[1] == 'C': 393 | # Right now, Pandas doesn't support string length 394 | # Should work fine for sqlite and postgresql 395 | dtype[field[0]] = sql.types.String#(field[2]) 396 | 397 | # The default behavior is to append new data to existing tables. 398 | if not chunksize: 399 | df = self.to_dataframe() 400 | df.to_sql(table, engine_inst, dtype=dtype, if_exists='append') 401 | else: 402 | for df in self.to_dataframe(chunksize=chunksize): 403 | df.to_sql(table, engine_inst, dtype=dtype, if_exists='append') 404 | del(df) 405 | 406 | 407 | def to_pandashdf(self, h5name, table=None, chunksize=None, na='nan', 408 | complevel=9, complib='blosc', data_columns=None): 409 | '''Write DBF contents to an HDF5 file using Pandas. 410 | 411 | Parameters 412 | ---------- 413 | h5name : string 414 | The name of HDF file to use. By default, this file is opened in 415 | 'append' mode so that any existing files will not be overwritten, 416 | but it may cause problems. 417 | 418 | table : string, optional 419 | The name of the table to create for the DBF records. If 'None' 420 | (default), the DBF contents will be saved into a table with the 421 | same name as the input file without the file extension.The default 422 | behavior appends new data to an existing table. Delete the table 423 | by hand before running this method if you don't want the old data. 424 | 425 | chunksize : int, optional 426 | Maximum number of records to process at any given time. If 'None' 427 | (default), process all records. 428 | 429 | na : various types accepted, optional 430 | The value that will be used to replace missing or malformed 431 | entries. Right now this accepts pretty much anything, and that 432 | value will be used as a replacement. (May not do what you expect.) 433 | However, the strings 'na' or 'nan' (case insensitive) will insert 434 | float('nan'), the string 'none' (case insensitive) or will insert 435 | the Python object `None`. Default for HDF table is NaN ('nan'); 436 | however, float/int columns are always float('nan'). 437 | 438 | complib/complevel : int/string 439 | These keyword arguments set the compression library and level for 440 | the HDF file. These arguments are identical to the one defined for 441 | Pandas HDFStore, so see the Pandas documentation on `HDFStore` for 442 | more information. 443 | 444 | data_columns : list of column names or True 445 | This is a list of column names that will be created as data 446 | columns in the HDF file. This allows for advanced searching on 447 | these columns. If `True` is passed all columns will be data 448 | columns. There is some performace/file size degredation using this 449 | method, so for large numbers of columns, it is not recomended. See 450 | the Pandas IO documentation for more information. 451 | 452 | Notes 453 | ----- 454 | This method requires Pandas >= 0.15.2 and PyTables >= 3.1.1. 455 | 456 | The default here is to create a compressed HDF5 file using the 'blosc' 457 | compression library (compression level = 9). This shouldn't affect 458 | performance much, but it does save an enormous amount of disk space. 
459 | ''' 460 | self._na_set(na) 461 | if not table: 462 | table = self.dbf[:-4] # strip trailing ".dbf" 463 | 464 | h5 = pd.HDFStore(h5name, 'a', complevel=complevel, complib=complib) 465 | 466 | if not chunksize: 467 | df = self.to_dataframe() 468 | h5.append(table, df, data_columns=data_columns) 469 | else: 470 | # Find the maximum string column length This is necessary because 471 | # the appendable table can not change width if a new DF is added 472 | # with a longer string 473 | max_string_len = {} 474 | mx = 0 475 | for field in self.fields: 476 | if field[1] == "C" and field[2] > mx: 477 | mx = field[2] 478 | if mx != 0: 479 | max_string_len = {'values':mx} 480 | 481 | for df in self.to_dataframe(chunksize=chunksize): 482 | h5.append(table, df, min_itemsize=max_string_len, 483 | data_columns=data_columns) 484 | h5.flush(fsync=True) 485 | 486 | del(df) 487 | h5.close() 488 | 489 | class Dbf5(DbfBase): 490 | ''' 491 | DBF version 5 file processing object. 492 | 493 | This class defines the methods necessary for reading the header and 494 | records from a version 5 DBF file. Much of this code is based on an 495 | `ActiveState DBF example`_, which only worked for Python2. 496 | 497 | .. ActiveState DBF example: http://code.activestate.com/recipes/ 498 | 362715-dbf-reader-and-writer/ 499 | 500 | Parameters 501 | ---------- 502 | 503 | dbf : string 504 | The name (with optional path) of the DBF file. 505 | 506 | codec : string, optional 507 | The codec to use when decoding text-based records. The default is 508 | 'utf-8'. See Python's `codec` standard lib module for other options. 509 | 510 | Attributes 511 | ---------- 512 | 513 | dbf : string 514 | The input file name. 515 | 516 | f : file object 517 | The opened DBF file object 518 | 519 | numrec : int 520 | The number of records contained in this file. 521 | 522 | lenheader : int 523 | The length of the file header in bytes. 524 | 525 | numfields : int 526 | The number of data columns. 527 | 528 | fields : list of tuples 529 | Column descriptions as a tuple: (Name, Type, # of bytes). 530 | 531 | columns : list 532 | The names of the data columns. 533 | 534 | fmt : string 535 | The format string that is used to unpack each record from the file. 536 | 537 | fmtsiz : int 538 | The size of each record in bytes. 539 | ''' 540 | def __init__(self, dbf, codec='utf-8'): 541 | self._enc = codec 542 | path, name = os.path.split(dbf) 543 | self.dbf = name 544 | # Escape quotes, set by indiviual runners 545 | self._esc = None 546 | # Reading as binary so bytes will always be returned 547 | self.f = open(dbf, 'rb') 548 | 549 | self.numrec, self.lenheader = struct.unpack('Pandas conversion 630 | # Otherwise floats were not showing up correctly 631 | value = float('nan') 632 | 633 | # Date stores as string "YYYYMMDD", convert to datetime 634 | elif typ == 'D': 635 | try: 636 | y, m, d = int(value[:4]), int(value[4:6]), \ 637 | int(value[6:8]) 638 | if name not in self._dtypes: 639 | self._dtypes[name] = "date" 640 | except: 641 | value = self._na 642 | else: 643 | value = datetime.date(y, m, d) 644 | 645 | # Booleans can have multiple entry values 646 | elif typ == 'L': 647 | if name not in self._dtypes: 648 | self._dtypes[name] = "bool" 649 | if value in b'TyTt': 650 | value = True 651 | elif value in b'NnFf': 652 | value = False 653 | # '?' indicates an empty value, convert this to NaN 654 | else: 655 | value = self._na 656 | 657 | # Floating points are also stored as strings. 
658 | elif typ == 'F': 659 | if name not in self._dtypes: 660 | self._dtypes[name] = "float" 661 | try: 662 | value = float(value) 663 | except: 664 | value = float('nan') 665 | 666 | else: 667 | err = 'Column type "{}" not yet supported.' 668 | raise ValueError(err.format(typ)) 669 | 670 | result.append(value) 671 | yield result 672 | 673 | --------------------------------------------------------------------------------