├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── TEXTADAPTER_DEV.md ├── buildscripts ├── condarecipe │ ├── bld.bat │ ├── build.sh │ ├── meta.yaml │ └── run_test.py ├── fetch-dependencies └── jenkins-build ├── docs ├── Makefile ├── TextAdapter.rst ├── conf.py ├── eula.rst ├── genfromtxt.rst ├── index.rst ├── install.rst ├── loadtxt.rst ├── make.bat ├── release-notes.rst └── textadapter_examples.rst ├── environment.yml ├── setup.py ├── setupegg.py ├── textadapter ├── __init__.py ├── _version.py ├── core │ ├── IO.pyx │ ├── Index.pyx │ ├── TextAdapter.pxd │ ├── TextAdapter.pyx │ ├── __init__.py │ ├── genfromtxt.py │ ├── index.h │ ├── io.h │ ├── io_functions.c │ ├── io_functions.h │ ├── json_tokenizer.c │ ├── json_tokenizer.h │ ├── loadtxt.py │ ├── text_adapter.c │ └── text_adapter.h ├── examples │ ├── README │ ├── basic.py │ ├── converter.py │ ├── fixed_width.py │ ├── gzip_ints.py │ ├── missing_values.py │ └── regex.py ├── lib │ ├── Converters.pyx │ ├── __init__.py │ ├── _stdint.h │ ├── converter_functions.c │ ├── converter_functions.h │ ├── errors.py │ ├── field_info.c │ ├── field_info.h │ ├── khash.h │ ├── kstring.c │ ├── kstring.h │ └── kvec.h └── tests │ ├── Makefile │ ├── __init__.py │ ├── conftest.py │ ├── data │ └── benchmarks.py │ ├── generate.py │ ├── test_TextAdapter.py │ ├── test_ints.c │ ├── test_io.py │ └── test_text_adapter.c └── versioneer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | textadapter/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Editor temporary/working/backup files # 2 | ######################################### 3 | .#* 4 | [#]*# 5 | *~ 6 | *$ 7 | *.bak 8 | *.diff 9 | *.org 10 | .project 11 | *.rej 12 | .settings/ 13 | .*.sw[nop] 14 | .sw[nop] 15 | *.tmp 16 | 17 | # Compiled source # 18 | 
################### 19 | *.a 20 | *.com 21 | *.class 22 | *.dll 23 | *.exe 24 | *.o 25 | *.py[ocd] 26 | *.so 27 | 28 | # Python files # 29 | ################ 30 | # setup.py working directory 31 | build 32 | # sphinx build directory 33 | _build 34 | # setup.py dist directory 35 | dist 36 | doc/build 37 | doc/cdoc/build 38 | # Egg metadata 39 | *.egg-info 40 | # The shelf plugin uses this dir 41 | ./.shelf 42 | 43 | # Patches # 44 | ########### 45 | *.patch 46 | *.diff 47 | 48 | # OS generated files # 49 | ###################### 50 | .DS_Store* 51 | .VolumeIcon.icns 52 | .fseventsd 53 | Icon? 54 | .gdb_history 55 | ehthumbs.db 56 | Thumbs.db 57 | 58 | # Specific cython generated c files 59 | ###################### 60 | textadapter/core/TextAdapter.c 61 | 62 | # Generated data files for /tests and /examples 63 | textadapter/tests/data/fixedwidths 64 | textadapter/tests/data/floats 65 | textadapter/tests/data/ints 66 | textadapter/tests/data/ints.gz 67 | textadapter/tests/data/missingvalues 68 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009-2016, Continuum Analytics, Inc. and contributors All 2 | rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | Neither the name of Continuum Analytics nor the names of any contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 28 | THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Working Efficiently with Big Data in Text Formats Using Free Software 2 | 3 | One of our first commercial software products at Continuum Analytics was a product called *IOPro* which we have sold continuously since 2009. Now we are releasing the code under a liberal open source license. 4 | 5 | Following the path of widely adopted projects like conda, Blaze, Dask, odo, Numba, Conda, Bokeh, datashader, DataShape, DyND and other software that Continuum has created, we hope that the code in IOPro becomes valuable to open source communities and data scientists. 6 | 7 | We don't only hope this code is useful to you, however, we also hope you—or your colleagues—will be able to enhance, to refine and to develop the code further to increase its utility for the entire Python world. 
8 | 9 | ## What IOPro does 10 | 11 | IOPro loads NumPy arrays and Pandas DataFrames directly from files, SQL databases and NoSQL stores–including ones with millions (or billions) of rows. It provides a drop-in replacement for NumPy data loading functions but dramatically improves performance and starkly reduces memory overhead. 12 | 13 | The key concept in our code is that we access data via *adapters* which are something like enhanced file handles or database cursors. An adapter does not read data directly into memory, but rather provides a mechanism to use familiar NumPy/Pandas slicing syntax to load manageable segments of a large dataset. Moreover, an adapter provides fine-grained control over exactly *how* data is eventually read into memory, whether using custom patterns for how a line of data is parsed, choosing the precise data type of a textually represented number, or exposing data as "calculated fields" (that is, "virtual columns"). 14 | 15 | As well as local CSV, JSON or other textual data sources, IOPro can load data from Amazon S3 buckets. When accessing large datasets—especially ones too large to load into memory—from files that do not have fixed record sizes, IOPro's indexing feature allows users to seek to a specific collection of records tens, hundreds or thousands of times faster than is possible with a linear scan. 16 | 17 | ## Our release schedule 18 | 19 | The initial release of our open source code will be of the TextAdapter component that makes up the better part of the code in IOPro. This code will be renamed—straightforwardly enough—as **TextAdapter**. The project will live at https://github.com/ContinuumIO/TextAdapter. We will make this forked project available by October 15, 2016 under a BSD 3-Clause License. 20 | 21 | Continuum is evaluating the details of our release of the database adapters, but will definitely make the code (though possibly unrefined) available by December 31, 2016. 
Our main hesitation with releasing the database adapters is that the state of the art in Python database adapters has advanced considerably since 2009, and we do not want to advocate a codebase unless it is currently best-of-breed (at very least for some niche use case). At worst, we will still release the code as an historical artifact. Those projects will live at https://github.com/ContinuumIO/DBAdapter, https://github.com/ContinuumIO/PostgresAdapter, https://github.com/ContinuumIO/AccumuloAdapter, and https://github.com/ContinuumIO/MongoAdapter. 22 | 23 | If you are a current paid customer of IOPro, and are due for renewal before January 1, 2017, your sales rep will get in touch with you for renewal arrangements. We will continue to monitor and reply to issues and discussion about these successor projects at their GitHub repositories. 24 | 25 | Thank you to prior contributors at Continuum, especially Jay Bourque (jayvius), but notably also Francesc Alted (FrancescAlted), Óscar Villellas Guillén (ovillellas), Michael Kleehammer (mkleehammer) and Ilan Schnell (ilanschnell) for their wonderful contributions. Any remaining bugs are my responsibility alone as current maintainer of the project. 26 | 27 | ## The Blaze ecosystem 28 | 29 | As part of the open source release of TextAdapter, we plan to integrate TextAdapter into the Blaze ecosystem. Blaze itself, as well as odo, provide translation between data formats and querying of data within a large variety of formats. Putting TextAdapter clearly in this ecosystem will let an *adapter* act as one such data format, and hence leverage the indexing speedups and data massaging that TextAdapter provides. 30 | 31 | 32 | 33 | 34 | ## TextAdapter 35 | 36 | TextAdapter is a Python module containing optimized data adapters for 37 | importing data from a variety of data sources into NumPy arrays and Pandas 38 | DataFrame. Current data adapters include TextAdapter for JSON, free-form, 39 | and CSV-like text files. 
DBAdapter, also based on IOPro, accesses 40 | MongoAdapter for mongo databases, PostgresAdapter for PostgreSQL databases, 41 | AccumuloAdapter for Accumulo databases, and an optimized pyodbc module for 42 | accessing any relational database that supports the ODBC interface (SQL 43 | Server, PostgreSQL, MySQL, etc). 44 | 45 | ## Build Requirements 46 | 47 | Building TextAdapter requires a number of dependencies. In addition to a 48 | C/C++ dev environment, the following modules are needed, which can be 49 | installed via conda. 50 | 51 | * NumPy 52 | * Pandas 53 | * zlib 1.2.8 (C lib) 54 | * pcre 8.31 (C lib) 55 | 56 | ## Building Conda Package 57 | 58 | Note: If building under Windows, make sure the following commands are issued 59 | within the Visual Studio command prompt for version of Visual Studio that 60 | matches the version of Python you're building for. Python 2.6 and 2.7 needs 61 | Visual Studio 2008, Python 3.3 and 3.4 needs Visual Studio 2010, and Python 62 | 3.5 needs Visual Studio 2015. 63 | 64 | 1. Build TextAdapter using the following command: 65 | 66 | ``` 67 | conda build buildscripts/condarecipe --python 3.5 68 | ``` 69 | 70 | 1. TextAdapter can now be installed from the built conda package: 71 | 72 | ``` 73 | conda install textadapter --use-local 74 | ``` 75 | 76 | ## Building By Hand 77 | 78 | Note: If building under Windows, make sure the following commands are issued 79 | within the Visual Studio command prompt for version of Visual Studio that 80 | matches the version of Python you're building for. Python 2.6 and 2.7 needs 81 | Visual Studio 2008, Python 3.3 and 3.4 needs Visual Studio 2010, and Python 82 | 3.5 needs Visual Studio 2015. 83 | 84 | For building TextAdapter for local development/testing: 85 | 86 | 1. Install most of the above dependencies into environment called 87 | 'textadapter': 88 | 89 | ``` 90 | conda env create -f environment.yml 91 | ``` 92 | 93 | Be sure to activate new TextAdapter environment before proceeding. 
94 | 95 | 96 | 1. Build TextAdapter using Cython/distutils: 97 | 98 | ``` 99 | python setup.py build_ext --inplace 100 | ``` 101 | 102 | ## Testing 103 | 104 | Tests can be run by calling the iopro module's test function. By default 105 | only the TextAdapter tests will be run: 106 | 107 | ```python 108 | python -Wignore -c 'import textadapter; textadapter.test()' 109 | ``` 110 | 111 | (Note: `numpy.testing` might produce a FutureWarning that is not directly 112 | relevant to these unit tests) 113 | 114 | 115 | Related projects 116 | ---------------- 117 | 118 | - DBAdapter (SQL derivatives): https://github.com/ContinuumIO/DBAdapter 119 | - PostgresAdapter (PostgreSQL): https://github.com/ContinuumIO/PostgresAdapter 120 | - AccumuloAdapter (Apache Accumulo): https://github.com/ContinuumIO/AccumuloAdapter 121 | - MongoAdapter (MongoDB): https://github.com/ContinuumIO/MongoAdapter 122 | 123 | 124 | ## Other open source tools 125 | 126 | Other open source projects for interacting with large datasets provide either competitors or collaborative capabilities. 127 | 128 | * The **ParaText** from Wise Technology looks like a very promising approach to accelerating raw reads of CSV data. It doesn't currently provide regular expression matching nor as rich data typing as IOPro, but the raw reads are shockingly fast. Most importantly, perhaps, ParaText does not address indexing, so as fast as it is at linear scan, it remains stuck with big-O inefficiencies that TextAdapter addresses. I personally think that (optionally) utilizing the underlying reader of ParaText as a layer underneath TextAdapter would be a wonderful combination. Information about ParaText can be found at http://www.wise.io/tech/paratext 129 | 130 | Database access is almost always I/O bound rather than CPU bound, and hence the likely wins are by switching to asynchronous frameworks. 
This *does* involve using a somewhat different programming style than synchronous adapters, but some recent ones look amazingly fast. I am not yet sure whether it is worthwhile to create IOPro style adapters around these `asyncio`-based interfaces. 131 | 132 | * **asyncpg** is a database interface library designed specifically for PostgreSQL and Python/asyncio. asyncpg is an efficient, clean implementation of PostgreSQL server binary protocol. Information about asyncpg can be found at https://magicstack.github.io/asyncpg/current/. 133 | 134 | * **Motor** presents a callback- or Future-based API for non-blocking access to MongoDB from Tornado or asyncio. Information about Motor can be found at http://motor.readthedocs.io/en/stable/. 135 | -------------------------------------------------------------------------------- /TEXTADAPTER_DEV.md: -------------------------------------------------------------------------------- 1 | Notes on the Development and Design of the TextAdapter Module 2 | ============================================================= 3 | 4 | The TextAdapter module was the first and most complicated IOPro data 5 | adapter. The rest of the data adapters loosely follow the design of the 6 | TextAdapter module described below. 7 | 8 | Key Ideas 9 | --------- 10 | 11 | The TextAdapter module supports parsing tab delimited text, text with fixed 12 | width fields, JSON text, and text whose fields can be described with regular 13 | expressions. 14 | 15 | The guts of the TextAdapter module are written in C, with a Python interface 16 | implemented in Cython. 17 | 18 | The IOPro interface for the data adapters (TextAdapter, MongoAdapter, 19 | PostgresAdapter, and AccumuloAdapter) is designed to be numpy array-like in 20 | that slicing on the adapter is used to retrieve subsets of data. 
When the 21 | adapter object is first created, no data is actually read (except for a few 22 | records at the beginning of the input data to determine field types, number 23 | of fields, etc). 24 | 25 | IOPro is generally optimized for memory usage over speed, although speed is 26 | definitely a primary goal too. Data copying is kept to a minimum so that as 27 | much data as possible can be read into a numpy array. 28 | 29 | A TextAdapter object contains an array of function pointers, one for each 30 | field, that point to conversion functions that are responsible for 31 | converting input text data to the final output value. 32 | 33 | A TextAdapter object also contains a set of function pointers to IO related 34 | functions (open, seek, read, and close) responsible for reading data from the 35 | data source. Compressed data seek and read functions can also be set if 36 | source data is compressed. By combining normal IO function pointers with 37 | compressed data seek/read function pointers, the TextAdapter module can 38 | easily handle any supported data source that is also compressed with one of 39 | the supported compression schemes (currently only gzip). 40 | 41 | A TextAdapter object also contains a function pointer that points to a 42 | tokenizer function appropriate for the input text type. The tokenizer 43 | function is responsible for parsing the input text data and calling 44 | process_token to convert text data into the final output data type for the 45 | current field. Each text type has a tokenizer function. Tokenizer 46 | functions are also implemented for parsing lines and records as single 47 | string values (a record can be multiple lines). 48 | 49 | Key Low Level C Data Structures 50 | ======= 51 | 52 | TextAdapter (textadapter/core/text_adapter.h): 53 | 54 | Core struct for text parser. Contains attributes for input text such as 55 | delimiter character, comment character, etc. 
tokenize field is a function 56 | pointer to the tokenize function for parsing specific type of text (tab 57 | delimited, fixed width, etc). Also contains pointers to InputData struct 58 | and TextAdapterBuffer described below. 59 | 60 | InputData (textadapter/core/text_adapter.h): 61 | 62 | Contains function pointers for IO functions (open, read, seek, close) and 63 | for compressed data read and seek functions. Also contains a void *input 64 | field for storing a data structure specific to each data source (C FILE 65 | pointer, S3 bucket info, etc). 66 | 67 | TextAdapterBuffer (textadapter/core/text_adapter.h): 68 | 69 | Main buffer for storing text data to be parsed. 70 | 71 | Ideas for Future Optimizations 72 | ======= 73 | 74 | - The biggest performance gains could be had by incorporating some parallel 75 | processing goodness. The most natural way to split it up (this should 76 | work for all the adapters) might be to have one thread/process that reads 77 | the input data into the main buffer, and a second thread/process do the 78 | actual parsing and converting of the data, and storing of the converted 79 | data in the final numpy array. 80 | 81 | - Another idea for a potential speedup might be to refactor the parsing 82 | backend so that offsets for all the tokens for a field in the buffer are 83 | returned, and then have separate loops for different field types, that 84 | would power through all the tokens for a field and call the appropriate 85 | conversion function (the key would be to decide outside of the loops which 86 | loop+conversion function to execute, so that the conversion function would 87 | be inlined inside each loop). This is essentially how the Pandas CSV 88 | reader works, but it would increase memory usage. 
For example (in 89 | python-like pseudocode but implemented at the C level): 90 | 91 | ``` 92 | if field_type is integers: 93 | for i in range(num_records): 94 | convert_and_store_ints(field_token_offsets[i]) 95 | elif field_type is floats: 96 | for i in range(num_records): 97 | convert_and_store_floats(field_token_offsets[i]) 98 | ``` 99 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/bld.bat: -------------------------------------------------------------------------------- 1 | %PYTHON% setup.py install 2 | if errorlevel 1 exit 1 3 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $PYTHON setup.py install 4 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: textadapter 3 | version: "2.0.0" 4 | 5 | source: 6 | path: ../../ 7 | 8 | build: 9 | number: 0 10 | 11 | requirements: 12 | build: 13 | - python 14 | - numpy 15 | - cython 16 | 17 | # zlib and pcre versions pinned to fix shared library issues 18 | - zlib 1.2.8 19 | - pcre 8.31 20 | 21 | run: 22 | - python 23 | - numpy 24 | - pandas 25 | - six 26 | - ordereddict [py26] 27 | 28 | test: 29 | requires: 30 | - nose 31 | - pytest 32 | 33 | imports: 34 | - textadapter 35 | - textadapter.core.TextAdapter 36 | 37 | about: 38 | home: https://github.com/ContinuumIO/TextAdapter 39 | license: BSD 40 | summary: python interface to Amazon S3 and large data files 41 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/run_test.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | assert textadapter.test() 4 | 5 | 
print('textadapter.__version__: %s' % textadapter.__version__) 6 | -------------------------------------------------------------------------------- /buildscripts/fetch-dependencies: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | IOPRO_INCLUDE_PATH=$HOME/anaconda/envs/iopro-test-np1.7/include/python2.7 5 | 6 | WHERE="$HOME/dependencies/" 7 | 8 | function mk_depend_path() 9 | { 10 | echo $WHERE$1 11 | } 12 | 13 | if [ ! -d $(mk_depend_path '') ]; then 14 | mkdir $(mk_depend_path '') 15 | fi 16 | 17 | pushd $(mk_depend_path '') >/dev/null 18 | 19 | 20 | # pcre version 8.30 21 | if [ ! -d pcre-8.30 ]; then 22 | if [ ! -f pcre-8.30.tar.gz ]; then 23 | wget ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.30.tar.gz 24 | fi 25 | 26 | tar -zxvf pcre-8.30.tar.gz 27 | pushd pcre-8.30 >/dev/null 28 | ./configure 29 | make 30 | popd >/dev/null 31 | fi 32 | 33 | if [ -f pcre-8.30.tar.gz ]; then 34 | # leave it clean 35 | rm pcre-8.30.tar.gz 36 | fi 37 | 38 | 39 | # zlib-1.2.7 40 | if [ ! -d zlib-1.2.7 ]; then 41 | if [ ! 
-f zlib-1.2.7.tar.bz2 ]; then 42 | wget http://downloads.sourceforge.net/project/libpng/zlib/1.2.7/zlib-1.2.7.tar.bz2 43 | fi 44 | tar -jxvf zlib-1.2.7.tar.bz2 45 | 46 | pushd zlib-1.2.7 >/dev/null 47 | ./configure 48 | make 49 | popd >/dev/null 50 | fi 51 | 52 | if [ -f zlib-1.2.7.tar.bz2 ]; then 53 | #leave it clean 54 | rm zlib-1.2.7.tar.bz2 55 | fi 56 | 57 | 58 | IOPRO_INCLUDE_PATH=$(mk_depend_path pcre-8.30):$IOPRO_INCLUDE_PATH 59 | IOPRO_INCLUDE_PATH=$(mk_depend_path zlib-1.2.7):$IOPRO_INCLUDE_PATH 60 | export IOPRO_INCLUDE_PATH 61 | 62 | echo 'IOPRO_INCLUDE_PATH=' $IOPRO_INCLUDE_PATH 63 | 64 | IOPRO_LIBRARY_PATH=$(mk_depend_path pcre-8.30/.libs):$IOPRO_LIBRARY_PATH 65 | IOPRO_LIBRARY_PATH=$(mk_depend_path zlib-1.2.7):$IOPRO_LIBRARY_PATH 66 | export IOPRO_LIBRARY_PATH 67 | 68 | echo 'IOPRO_LIBRARY_PATH=' $IOPRO_LIBRARY_PATH 69 | 70 | LD_LIBRARY_PATH=$(mk_depend_path pcre-8.30/.libs):$LD_LIBRARY_PATH 71 | LD_LIBRARY_PATH=$(mk_depend_path zlib-1.2.7):$LD_LIBRARY_PATH 72 | export LD_LIBRARY_PATH 73 | 74 | echo 'LD_LIBRARY_PATH=' $LD_LIBRARY_PATH 75 | 76 | popd >/dev/null 77 | 78 | printf '\n\nBuilding...\n' 79 | python setup.py build_ext --inplace --include-dirs=$IOPRO_INCLUDE_PATH --library-dirs=$IOPRO_LIBRARY_PATH || exit 1 80 | 81 | exit 82 | 83 | 84 | -------------------------------------------------------------------------------- /buildscripts/jenkins-build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHON_VERSION=2.7 4 | 5 | if [ "${PYTHON_VERSION}" == "" ]; then 6 | echo You must select a Python version with the PYTHON_VERSION variable. 7 | exit 1 8 | fi 9 | 10 | # Start from scratch 11 | if [ -d build ]; then 12 | rm -rf build 13 | fi 14 | mkdir build 15 | cd build 16 | 17 | # Use conda to create a conda environment of the required 18 | # python version and containing the dependencies. 
19 | export PYENV_PREFIX=${WORKSPACE}/build/pyenv 20 | rm -rf ${PYENV_PREFIX} 21 | ~/anaconda/bin/conda create --yes -p ${PYENV_PREFIX} anaconda python=${PYTHON_VERSION} numpy=1.7 || exit 1 22 | export PATH=${PYENV_PREFIX}/bin:${PATH} 23 | 24 | # JNB: Get rid of any iopro that conda may have installed 25 | rm -rf ${PYENV_PREFIX}/lib/python2.7/site-packages/iopro* 26 | 27 | # Get and build pcre lib 28 | if [ ! -f ${WORKSPACE}/pcre-8.30.tar.gz ]; then 29 | cd .. 30 | wget ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.30.tar.gz 31 | cd build 32 | fi 33 | 34 | tar -zxvf ../pcre-8.30.tar.gz 35 | cd pcre-8.30 36 | ./configure 37 | make 38 | cd .. 39 | 40 | # Get and build gzip compression lib 41 | if [ ! -f ${WORKSPACE}/zlib-1.2.7.tar.bz2 ]; then 42 | cd .. 43 | wget http://downloads.sourceforge.net/project/libpng/zlib/1.2.7/zlib-1.2.7.tar.bz2 44 | cd build 45 | fi 46 | 47 | tar -jxvf ../zlib-1.2.7.tar.bz2 48 | cd zlib-1.2.7 49 | ./configure 50 | make 51 | cd .. 52 | 53 | 54 | # Set up include and lib paths since we're not installing in default system paths 55 | export IOPRO_INCLUDE_PATH=${WORKSPACE}/build/pcre-8.30:$IOPRO_INCLUDE_PATH 56 | export IOPRO_LIBRARY_PATH=${WORKSPACE}/build/pcre-8.30/.libs:$IOPRO_LIBRARY_PATH 57 | export IOPRO_INCLUDE_PATH=${WORKSPACE}/build/zlib-1.2.7:$IOPRO_INCLUDE_PATH 58 | export IOPRO_LIBRARY_PATH=${WORKSPACE}/build/zlib-1.2.7:$IOPRO_LIBRARY_PATH 59 | export IOPRO_INCLUDE_PATH=~/anaconda/include/python${PYTHON_VERSION}:$IOPRO_INCLUDE_PATH 60 | export IOPRO_LIBRARY_PATH=~/anaconda/lib:$IOPRO_LIBRARY_PATH 61 | 62 | export LD_LIBRARY_PATH=${WORKSPACE}/build/pcre-8.30/.libs:$LD_LIBRARY_PATH 63 | export LD_LIBRARY_PATH=${WORKSPACE}/build/zlib-1.2.7:$LD_LIBRARY_PATH 64 | 65 | cd .. 
66 | python setup.py build_ext --inplace --include-dirs=$IOPRO_INCLUDE_PATH --library-dirs=$IOPRO_LIBRARY_PATH || exit 1 67 | python -c 'import textadapter; import sys; sys.exit(1 - textadapter.test(num_records=1000))' 68 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " 
gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IOPro.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IOPro.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 
88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/IOPro" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IOPro" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/TextAdapter.rst: -------------------------------------------------------------------------------- 1 | ----------- 2 | TextAdapter 3 | ----------- 4 | 5 | .. contents:: 6 | 7 | The TextAdapter module reads CSV data and produces a NumPy array containing the 8 | parsed data. The following features are currently implemented: 9 | 10 | * The TextAdapter engine is written 11 | in C to ensure text is parsed as fast as data can be read from the source. 12 | Text is read and parsed in small chunks instead of reading entire data into 13 | memory at once, which enables very large files to be read and parsed without 14 | running out of memory. 15 | 16 | * Python slicing notation can be used to specify a subset of records to be 17 | read from the data source, as well as a subset of fields. 18 | 19 | * Fields can be specified in any one of three ways: by a delimiter character, 20 | using fixed field widths, or by a regular expression. This enables a larger 21 | variety of CSV-like and other types of text files to be parsed. 22 | 23 | * A gzipped file can be parsed without having to uncompress it first. Parsing speed 24 | is about the same as an uncompressed version of same file. 25 | 26 | * An index of record offsets in a file can be built to allow fast random access to 27 | records. 
This index can be saved to disk and loaded again later. 28 | 29 | * Converter functions can be specified for converting parsed text to proper dtype 30 | for storing in NumPy array. 31 | 32 | * The TextAdapter engine has automatic type inference so the user does not have to 33 | specify dtypes of the output array. The user can still specify dtypes manually if 34 | desired. 35 | 36 | * Remote data stored in Amazon S3 can be read. An index can be built and stored 37 | with S3 data. Index can be read remotely, allowing for random access to S3 data. 38 | 39 | Methods 40 | ------- 41 | The TextAdapter module contains the following factory methods for creating TextAdapter objects: 42 | 43 | **text_adapter** (source, parser='csv', compression=None, comment='#', 44 | quote='"', num_records=0, header=0, field_names=True, 45 | indexing=False, index_name=None, encoding='utf-8') 46 | 47 | | Create a text adapter for reading CSV, JSON, or fixed width 48 | | text files, or a text file defined by regular expressions. 49 | 50 | | source - filename, file object, StringIO object, BytesIO object, S3 key, 51 | http url, or python generator 52 | | parser - Type of parser for parsing text. Valid parser types are 'csv', 'fixed width', 'regex', and 'json'. 
53 | | encoding - type of character encoding (currently ascii and utf8 are supported) 54 | | compression - type of data compression (currently only gzip is supported) 55 | | comment - character used to indicate comment line 56 | | quote - character used to quote fields 57 | | num_records - limits parsing to specified number of records; defaults 58 | to all records 59 | | header - number of lines in file header; these lines are skipped when parsing 60 | | footer - number of lines in file footer; these lines are skipped when parsing 61 | | indexing - create record index on the fly as characters are read 62 | | index_name - name of file to write index to 63 | | output - type of output object (numpy array or pandas dataframe) 64 | 65 | 66 | If parser is set to 'csv', additional parameters include: 67 | | delimiter - Delimiter character used to define fields in data source. Default is ','. 68 | 69 | If parser is set to 'fixed_width', additional parameters include: 70 | | field_widths - List of field widths 71 | 72 | If parser is set to 'regex', additional parameters include: 73 | | regex - Regular expression used to define records and fields in data source. 74 | See the regular expression example in the Advanced Usage section. 75 | 76 | **s3_text_adapter** (access_key, secret_key, bucket_name, key_name, remote_s3_index=False) 77 | parser='csv', compression=None, comment='#', 78 | quote='"', num_records=0, header=0, field_names=True, 79 | indexing=False, index_name=None, encoding='utf-8') 80 | 81 | | Create a text adapter for reading a text file from S3.
Text file can be 82 | | CSV, JSON, fixed width, or defined by regular expressions 83 | 84 | In addition to the arguments described for the text_adapter function above, 85 | the s3_text_adapter function also has the following parameters: 86 | 87 | | access_key - AWS access key 88 | | secret_key - AWS secret key 89 | | bucket_name - name of S3 bucket 90 | | key_name - name of key in S3 bucket 91 | | remote_s3_index - use remote S3 index (index name must be key name + '.idx' extension) 92 | 93 | 94 | The TextAdapter object returned by the text_adapter factory method contains the following methods: 95 | 96 | **set_converter** (field, converter) 97 | | Set converter function for field 98 | 99 | | field - field to apply converter function 100 | | converter - python function object 101 | 102 | **set_missing_values** (missing_values) 103 | | Set strings for each field that represents a missing value 104 | 105 | | missing_values - dict of field name or number, 106 | and list of missing value strings 107 | 108 | Default missing values: 'NA', 'NaN', 'inf', '-inf', 'None', 'none', '' 109 | 110 | **set_fill_values** (fill_values, loose=False) 111 | | Set fill values for each field 112 | 113 | | fill_values - dict of field name or number, and fill value 114 | | loose - If value cannot be converted, and value does not match 115 | any of the missing values, replace with fill value anyway. 116 | 117 | Default fill values for each data type: 118 | | int - 0 119 | | float - numpy.nan 120 | | char - 0 121 | | bool - False 122 | | object - numpy.nan 123 | | string - numpy.nan 124 | 125 | **create_index** (index_name=None, density=1) 126 | | Create an index of record offsets in file 127 | 128 | | index_name - Name of file on disk used to store index. If None, index 129 | will be created in memory but not saved. 130 | | density - density of index. Value of 1 will index every record, value of 131 | 2 will index every other record, etc. 
132 | 133 | **to_array** () 134 | | Parses entire data source and returns data as NumPy array object 135 | 136 | **to_dataframe** () 137 | | Parses entire data source and returns data as Pandas DataFrame object 138 | 139 | The TextAdapter object contains the following properties: 140 | 141 | **size** (readonly) 142 | | Number of records in data source. This value is only set if entire data 143 | source has been read or indexed, or number of records was specified in 144 | text_adapter factory method when creating object. 145 | 146 | **field_count** (readonly) 147 | | Number of fields in each record 148 | 149 | **field_names** 150 | | Field names to use when creating output NumPy array. Field names can be 151 | set here before reading data or in text_adapter function with 152 | field_names parameter. 153 | 154 | **field_types** 155 | | NumPy dtypes for each field, specified as a dict of fields and associated 156 | dtype. (Example: {0:'u4', 1:'f8', 2:'S10'}) 157 | 158 | **field_filter** 159 | | Fields in data source to parse, specified as a list of field numbers 160 | or names (Examples: [0, 1, 2] or ['f1', 'f3', 'f5']). This filter stays 161 | in effect until it is reset to empty list, or is overridden with array 162 | slicing (Example: adapter[[0, 1, 3, 4]][:]).
163 | 164 | See the NumPy data types documentation for more details: 165 | http://docs.continuum.io/anaconda/numpy/reference/arrays.dtypes.html 166 | 167 | The TextAdapter object supports array slicing: 168 | 169 | | Read all records: 170 | adapter[:] 171 | 172 | | Read first 100 records: 173 | adapter[0:100] 174 | 175 | | Read last record (only if data has been indexed or entire dataset 176 | has been read once before): 177 | adapter[-1] 178 | 179 | | Read first field in all records by specifying field number: 180 | adapter[0][:] 181 | 182 | | Read first field in all records by specifying field name: 183 | adapter['f0'][:] 184 | 185 | | Read first and third fields in all records: 186 | adapter[[0, 2]][:] 187 | 188 | Basic Usage 189 | ----------- 190 | 191 | Create TextAdapter object for data source:: 192 | 193 | >>> import iopro 194 | >>> adapter = iopro.text_adapter('data.csv', parser='csv') 195 | 196 | Parse text and store records in NumPy array using slicing notation:: 197 | 198 | >>> # read all records 199 | >>> array = adapter[:] 200 | 201 | >>> # read first ten records 202 | >>> array = adapter[0:10] 203 | 204 | >>> # read last record 205 | >>> array = adapter[-1] 206 | 207 | >>> # read every other record 208 | >>> array = adapter[::2] 209 | 210 | Advanced Usage 211 | -------------- 212 | 213 | user defined converter function for field 0:: 214 | 215 | >>> import iopro 216 | >>> import io 217 | 218 | >>> data = '1, abc, 3.3\n2, xxx, 9.9' 219 | >>> adapter = iopro.text_adapter(io.StringIO(data), parser='csv', field_names=False) 220 | 221 | >>> # Override default converter for first field 222 | >>> adapter.set_converter(0, lambda x: int(x)*2) 223 | >>> adapter[:] 224 | array([(2L, ' abc', 3.3), (4L, ' xxx', 9.9)], 225 | dtype=[('f0', '>> import iopro 230 | >>> import io 231 | 232 | >>> data = '1,abc,inf\n2,NA,9.9' 233 | >>> adapter = iopro.text_adapter(io.StringIO(data), parser='csv', field_names=False) 234 | 235 | >>> # Define field dtypes (example: set field 
1 to string object and field 2 to float) 236 | >>> adapter.field_types = {1:'O', 2:'f4'} 237 | 238 | >>> # Define list of strings for each field that represent missing values 239 | >>> adapter.set_missing_values({1:['NA'], 2:['inf']}) 240 | 241 | >>> # Set fill value for missing values in each field 242 | >>> adapter.set_fill_values({1:'xxx', 2:999.999}) 243 | >>> adapter[:] 244 | array([(' abc', 999.9990234375), ('xxx', 9.899999618530273)], 245 | dtype=[('f0', 'O'), ('f1', '>> import iopro 250 | >>> adapter = iopro.text_adapter('data.gz', parser='csv', compression='gzip') 251 | 252 | >>> # Build index of records and save index to disk. 253 | >>> adapter.create_index(index_name='index_file') 254 | 255 | >>> # Create new adapter object and load index from disk. 256 | >>> adapter = iopro.text_adapter('data.gz', parser='csv', compression='gzip', indexing=True, index_name='index_file') 257 | 258 | >>> # Read last record 259 | >>> adapter[-1] 260 | array([(100, 101, 102)],dtype=[('f0', '>> import iopro 265 | >>> import io 266 | 267 | >>> # Define regular expression to extract dollar amount, percentage, and month. 268 | >>> # Each set of parentheses defines a field. 269 | >>> data = '$2.56, 50%, September 20 1978\n$1.23, 23%, April 5 1981' 270 | >>> regex_string = '([0-9]\.[0-9][0-9]+)\,\s ([0-9]+)\%\,\s ([A-Za-z]+)' 271 | >>> adapter = iopro.text_adapter(io.StringIO(data), parser='regex', regex_string=regex_string, field_names=False, infer_types=False) 272 | 273 | >>> # set dtype of field to float 274 | >>> adapter.field_types = {0:'f4', 1:'u4', 2:'S10'} 275 | >>> adapter[:] 276 | array([(2.56, 50L, 'September'), (1.23, 23L, 'April')], 277 | dtype=[('f0', ' v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 
113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 
161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'IOProdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'IOPro.tex', u'IOPro Documentation', 187 | u'Continuum Analytics', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 
215 | man_pages = [ 216 | ('index', 'iopro', u'IOPro Documentation', 217 | ['Continuum Analytics', 'Jay Bourque', 'David Mertz'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'IOPro', u'IOPro Documentation', 231 | u'Continuum Analytics', 'IOPro', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 242 | #texinfo_show_urls = 'footnote' 243 | -------------------------------------------------------------------------------- /docs/eula.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | IOPro END USER LICENSE AGREEMENT 3 | ================================ 4 | 5 | IOPro ("the Software Product") and accompanying documentation is licensed and 6 | not sold. The Software Product is protected by copyright laws and treaties, as 7 | well as laws and treaties related to other forms of intellectual property. 8 | Continuum Analytics, Inc. or its subsidiaries, affiliates, and suppliers 9 | (collectively "Continuum") own intellectual property rights in the Software 10 | Product. The Licensee's ("you" or "your") license to download, use, copy, or 11 | change the Software Product is subject to these rights and to all the terms 12 | and conditions of this End User License Agreement ("Agreement"). 
13 | 14 | Acceptance 15 | ========== 16 | 17 | YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS AGREEMENT BY SELECTING 18 | THE "ACCEPT" OPTION AND DOWNLOADING THE SOFTWARE PRODUCT OR BY INSTALLING, 19 | USING, OR COPYING THE SOFTWARE PRODUCT. YOU MUST AGREE TO ALL OF THE TERMS OF 20 | THIS AGREEMENT BEFORE YOU WILL BE ALLOWED TO DOWNLOAD THE SOFTWARE PRODUCT. IF 21 | YOU DO NOT AGREE TO ALL OF THE TERMS OF THIS AGREEMENT, YOU MUST SELECT 22 | "DECLINE" AND YOU MUST NOT INSTALL, USE, OR COPY THE SOFTWARE PRODUCT. 23 | 24 | Trial Period 25 | ============ 26 | 27 | You have the right to use IOPro on a single computer or group of computers for 28 | 30 days with no license. After 30 days you must purchase an appropriate 29 | license to use the software on one or more machines or stop using the software 30 | and remove it from all of your machines on which you installed the software. 31 | 32 | License Grant 33 | ============= 34 | 35 | This Agreement entitles you to install and use one copy of the Software 36 | Product on as many machines as you will personally use. The Software Product 37 | is licensed to a particular user. Only the user to whom the software is 38 | licensed may use the software. You must obtain a license for as many users as 39 | you wish to use the software. In addition, you may make archival copies of 40 | the Software Product Installer. 41 | 42 | Right to Updates 43 | ================ 44 | 45 | This license entitles you to updates to the Software Product for one year from 46 | the time of payment. The Software Product will continue to function and you 47 | may continue to use The Software Product and any updates you have received for 48 | as long as you would like however you will no longer be able to receive 49 | updates from Continuum unless this License is renewed. Please contact 50 | sales@continuum.io with any questions or concerns. 
51 | 52 | Restrictions on Transfer 53 | ======================== 54 | 55 | Without first obtaining the express written consent of Continuum, you may not 56 | assign your rights and obligations under this Agreement, or redistribute, 57 | encumber, sell, rent, lease, sublicense, or otherwise transfer your rights to 58 | the Software Product. 59 | 60 | Restrictions on Use 61 | =================== 62 | 63 | You may not use, copy, or install the Software Product on any system where 64 | more than one user will be able to use the software unless you have purchased 65 | a license for each user of the system. You may not decompile, 66 | "reverse-engineer", disassemble, or otherwise attempt to derive the source 67 | code for the Software Product. 68 | 69 | Restrictions on Alteration 70 | ========================== 71 | 72 | You may not modify the Software Product or create any derivative work of the 73 | Software Product or its accompanying documentation. Derivative works include 74 | but are not limited to translations. You may not alter any files or libraries 75 | in any portion of the Software Product. 76 | 77 | Restrictions on Copying 78 | ======================= 79 | 80 | You may not copy any part of the Software Product except to the extent that 81 | licensed use inherently demands the creation of a temporary copy stored in 82 | computer memory and not permanently affixed on storage medium. You may make 83 | archival copies of the Software Product installer. 84 | 85 | Limited Software Product Warranty 86 | ================================= 87 | 88 | For a period of 60 days from the date of shipment or from the date that you 89 | download the Software Product, as applicable, Continuum warrants that when 90 | properly installed and used under normal conditions, the Software Product will 91 | perform substantially as advertised. 
92 | 93 | Disclaimer of Warranties and Limitation of Liability 94 | ==================================================== 95 | 96 | UNLESS OTHERWISE EXPLICITLY AGREED TO IN WRITING BY CONTINUUM, CONTINUUM MAKES 97 | NO OTHER WARRANTIES, EXPRESS OR IMPLIED, IN FACT OR IN LAW, INCLUDING, BUT NOT 98 | LIMITED TO, ANY IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A 99 | PARTICULAR PURPOSE OTHER THAN AS SET FORTH IN THIS AGREEMENT OR IN THE LIMITED 100 | WARRANTY DOCUMENTS PROVIDED WITH THE SOFTWARE PRODUCT. 101 | 102 | Continuum makes no warranty that the Software Product will meet your 103 | requirements or operate under your specific conditions of use. Continuum makes 104 | no warranty that operation of the Software Product will be secure, error free, 105 | or free from interruption. YOU MUST DETERMINE WHETHER THE SOFTWARE PRODUCT 106 | SUFFICIENTLY MEETS YOUR REQUIREMENTS FOR SECURITY AND UNINTERRUPTABILITY. YOU 107 | BEAR SOLE RESPONSIBILITY AND ALL LIABILITY FOR ANY LOSS INCURRED DUE TO 108 | FAILURE OF THE SOFTWARE PRODUCT TO MEET YOUR REQUIREMENTS. CONTINUUM WILL NOT, 109 | UNDER ANY CIRCUMSTANCES, BE RESPONSIBLE OR LIABLE FOR THE LOSS OF DATA ON ANY 110 | COMPUTER OR INFORMATION STORAGE DEVICE. UNDER NO CIRCUMSTANCES SHALL 111 | CONTINUUM, ITS DIRECTORS, OFFICERS, EMPLOYEES OR AGENTS BE LIABLE TO YOU OR 112 | ANY OTHER PARTY FOR INDIRECT, CONSEQUENTIAL, SPECIAL, INCIDENTAL, PUNITIVE, OR 113 | EXEMPLARY DAMAGES OF ANY KIND (INCLUDING LOST REVENUES OR PROFITS OR LOSS OF 114 | BUSINESS) RESULTING FROM THIS AGREEMENT, OR FROM THE FURNISHING, PERFORMANCE, 115 | INSTALLATION, OR USE OF THE SOFTWARE PRODUCT, WHETHER DUE TO A BREACH OF 116 | CONTRACT, BREACH OF WARRANTY, OR THE NEGLIGENCE OF CONTINUUM OR ANY OTHER 117 | PARTY, EVEN IF CONTINUUM IS ADVISED BEFOREHAND OF THE POSSIBILITY OF SUCH 118 | DAMAGES.
TO THE EXTENT THAT THE APPLICABLE JURISDICTION LIMITS CONTINUUM'S 119 | ABILITY TO DISCLAIM ANY IMPLIED WARRANTIES, THIS DISCLAIMER SHALL BE EFFECTIVE 120 | TO THE MAXIMUM EXTENT PERMITTED. 121 | 122 | Limitation of Remedies and Damages 123 | ================================== 124 | 125 | Your remedy for a breach of this Agreement or of any warranty included in this 126 | Agreement is the correction or replacement of the Software Product. Selection 127 | of whether to correct or replace shall be solely at the discretion of 128 | Continuum. Continuum reserves the right to substitute a functionally 129 | equivalent copy of the Software Product as a replacement. If Continuum is 130 | unable to provide a replacement or substitute Software Product or corrections 131 | to the Software Product, your sole alternate remedy shall be a refund of the 132 | purchase price for the Software Product exclusive of any costs for shipping 133 | and handling. Any claim must be made within the applicable warranty period. 134 | All warranties cover only defects arising under normal use and do not include 135 | malfunctions or failure resulting from misuse, abuse, neglect, alteration, 136 | problems with electrical power, acts of nature, unusual temperatures or 137 | humidity, improper installation, or damage determined by Continuum to have 138 | been caused by you. All limited warranties on the Software Product are granted 139 | only to you and are non-transferable. You agree to indemnify and hold 140 | Continuum harmless from all claims, judgments, liabilities, expenses, or costs 141 | arising from your breach of this Agreement and/or acts or omissions. 142 | 143 | Governing Law, Jurisdiction and Costs 144 | ===================================== 145 | 146 | This Agreement is governed by the laws of Texas, without regard to Texas's 147 | conflict or choice of law provisions. 
148 | 149 | Export Regulations 150 | ================== 151 | 152 | Any use or distribution of IOPro is made under conditions that the user and/or 153 | distributor is in full compliance with all export and other governing laws of 154 | the United States of America, including full and ongoing compliance with the 155 | Export Administration Regulations (EAR) of the United States Department of 156 | Commerce. See www.commerce.gov/ and 157 | http://www.bis.doc.gov/index.php/regulations/export-administration-regulations-ear. 158 | Use or distribution of Continuum software products to any persons, entities or 159 | countries currently under US sanctions is strictly prohibited. IOPro is 160 | classified with an ECCN of 5D992 with no license required for export to 161 | non-embargoed countries. 162 | 163 | The United States currently has embargoes against Cuba, Iran, North Korea, 164 | Sudan and Syria. The exportation, re-exportation, sale or supply, directly or 165 | indirectly, from the United States, or by a U.S. person wherever located, of 166 | any Continuum software to any of these countries is strictly prohibited 167 | without prior authorization by the United States Government. By accepting this 168 | Agreement, you represent to Continuum that you will comply with all applicable 169 | export regulations for IOPro. 170 | 171 | 172 | Severability 173 | ============ 174 | 175 | If any provision of this Agreement shall be held to be invalid or 176 | unenforceable, the remainder of this Agreement shall remain in full force and 177 | effect. To the extent any express or implied restrictions are not permitted by 178 | applicable laws, these express or implied restrictions shall remain in force 179 | and effect to the maximum extent permitted by such applicable laws.
180 | -------------------------------------------------------------------------------- /docs/genfromtxt.rst: -------------------------------------------------------------------------------- 1 | ---------------- 2 | iopro.genfromtxt 3 | ---------------- 4 | 5 | Load data from a text file, with missing values handled as specified. 6 | 7 | Each line past the first `skip_header` lines is split at the `delimiter` 8 | character, and characters following the `comments` character are discarded. 9 | 10 | Parameters 11 | ---------- 12 | fname : file or str 13 | File, filename, or generator to read. If the filename extension is 14 | `.gz` or `.bz2`, the file is first decompressed. Note that 15 | generators must return byte strings in Python 3k. 16 | dtype : dtype, optional 17 | Data type of the resulting array. 18 | If None, the dtypes will be determined by the contents of each 19 | column, individually. 20 | comments : str, optional 21 | The character used to indicate the start of a comment. 22 | All the characters occurring on a line after a comment are discarded 23 | delimiter : str, int, or sequence, optional 24 | The string used to separate values. By default, any consecutive 25 | whitespaces act as delimiter. An integer or sequence of integers 26 | can also be provided as width(s) of each field. 27 | skip_header : int, optional 28 | The numbers of lines to skip at the beginning of the file. 29 | skip_footer : int, optional 30 | The numbers of lines to skip at the end of the file 31 | converters : variable, optional 32 | The set of functions that convert the data of a column to a value. 33 | The converters can also be used to provide a default value 34 | for missing data: ``converters = {3: lambda s: float(s or 0)}``. 35 | missing_values : variable, optional 36 | The set of strings corresponding to missing data. 37 | filling_values : variable, optional 38 | The set of values to be used as default when the data are missing. 
39 | usecols : sequence, optional 40 | Which columns to read, with 0 being the first. For example, 41 | ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns. 42 | names : {None, True, str, sequence}, optional 43 | If `names` is True, the field names are read from the first valid line 44 | after the first `skip_header` lines. 45 | If `names` is a sequence or a single-string of comma-separated names, 46 | the names will be used to define the field names in a structured dtype. 47 | If `names` is None, the names of the dtype fields will be used, if any. 48 | excludelist : sequence, optional 49 | A list of names to exclude. This list is appended to the default list 50 | ['return','file','print']. Excluded names are appended an underscore: 51 | for example, `file` would become `file_`. 52 | deletechars : str, optional 53 | A string combining invalid characters that must be deleted from the 54 | names. 55 | defaultfmt : str, optional 56 | A format used to define default field names, such as "f%i" or "f_%02i". 57 | autostrip : bool, optional 58 | Whether to automatically strip white spaces from the variables. 59 | replace_space : char, optional 60 | Character(s) used in replacement of white spaces in the variables 61 | names. By default, use a '_'. 62 | case_sensitive : {True, False, 'upper', 'lower'}, optional 63 | If True, field names are case sensitive. 64 | If False or 'upper', field names are converted to upper case. 65 | If 'lower', field names are converted to lower case. 66 | unpack : bool, optional 67 | If True, the returned array is transposed, so that arguments may be 68 | unpacked using ``x, y, z = loadtxt(...)`` 69 | usemask : bool, optional 70 | If True, return a masked array. 71 | If False, return a regular array. 72 | invalid_raise : bool, optional 73 | If True, an exception is raised if an inconsistency is detected in the 74 | number of columns. 75 | If False, a warning is emitted and the offending lines are skipped. 
76 | 77 | Returns 78 | ------- 79 | out : ndarray 80 | Data read from the text file. If `usemask` is True, this is a 81 | masked array. 82 | 83 | See Also 84 | -------- 85 | iopro.loadtxt : equivalent function when no data is missing. 86 | 87 | Notes 88 | ----- 89 | * When spaces are used as delimiters, or when no delimiter has been given 90 | as input, there should not be any missing data between two fields. 91 | * When the variables are named (either by a flexible dtype or with `names`, 92 | there must not be any header in the file (else a ValueError 93 | exception is raised). 94 | * Individual values are not stripped of spaces by default. 95 | When using a custom converter, make sure the function does remove spaces. 96 | 97 | Examples 98 | --------- 99 | >>> import iopro 100 | >>> from io import StringIO 101 | 102 | Comma delimited file with mixed dtype 103 | 104 | >>> s = StringIO("1,1.3,abcde") 105 | >>> data = iopro.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'), 106 | ... ('mystring','S5')], delimiter=",") 107 | >>> data 108 | array((1, 1.3, 'abcde'), 109 | dtype=[('myint', '>> s.seek(0) # needed for StringIO example only 114 | >>> data = iopro.genfromtxt(s, dtype=None, 115 | ... names = ['myint','myfloat','mystring'], delimiter=",") 116 | >>> data 117 | array((1, 1.3, 'abcde'), 118 | dtype=[('myint', '>> s.seek(0) 123 | >>> data = iopro.genfromtxt(s, dtype="i8,f8,S5", 124 | ... names=['myint','myfloat','mystring'], delimiter=",") 125 | >>> data 126 | array((1, 1.3, 'abcde'), 127 | dtype=[('myint', '>> s = StringIO("11.3abcde") 132 | >>> data = iopro.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'], 133 | ... delimiter=[1,3,5]) 134 | >>> data 135 | array((1, 1.3, 'abcde'), 136 | dtype=[('intvar', '`_. 21 | 22 | To start a 30-day free trial just download and install the IOPro package. 
23 | 24 | If you already have `Anaconda `_ (free 25 | Python platform) or `Miniconda `_ 26 | installed:: 27 | 28 | conda update conda 29 | conda install iopro 30 | 31 | If you do not have Anaconda installed, you can `download it 32 | `_. 33 | 34 | For more information about IOPro please contact `sales@continuum.io 35 | `_. 36 | 37 | Requirements 38 | ------------ 39 | 40 | * Python 2.7 or 3.4+ 41 | * NumPy 1.10+ 42 | 43 | Optional Python modules: 44 | 45 | * Boto (for S3 support) 46 | * Pandas (to use DataFrames) 47 | 48 | What's new in version 1.9? 49 | -------------------------- 50 | 51 | The documentation has been substantially updated for version 1.9.0. 52 | Numba has been removed and the code has been cleaned up, but no other 53 | features were added or removed. Some refactoring was done that didn't 54 | change functionality. We recommend that users not use older versions. 55 | See :doc:`Release notes ` for additional detail. 56 | 57 | 58 | Getting started 59 | --------------- 60 | 61 | Some of the basic usage patterns look like these. Create TextAdapter object 62 | for data source:: 63 | 64 | >>> import iopro 65 | >>> adapter = iopro.text_adapter('data.csv', parser='csv') 66 | 67 | Define field dtypes (example: set field 0 to unsigned int and field 4 to 68 | float):: 69 | 70 | >>> adapter.set_field_types({0: 'u4', 4:'f4'}) 71 | 72 | Parse text and store records in NumPy array using slicing notation:: 73 | 74 | >>> # read all records 75 | >>> array = adapter[:] 76 | 77 | >>> # read first ten records 78 | >>> array = adapter[0:10] 79 | 80 | >>> # read last record 81 | >>> array = adapter[-1] 82 | 83 | >>> # read every other record 84 | >>> array = adapter[::2] 85 | 86 | User guide 87 | ---------- 88 | 89 | .. toctree:: 90 | :maxdepth: 1 91 | 92 | install 93 | textadapter_examples 94 | eula 95 | release-notes 96 | 97 | Reference guide 98 | --------------- 99 | 100 | .. 
toctree:: 101 | :maxdepth: 1 102 | 103 | TextAdapter 104 | loadtxt 105 | genfromtxt 106 | 107 | 108 | Previous Versions 109 | ----------------- 110 | 111 | This documentation is provided for the use of our customers who have not yet upgraded 112 | to the current version. 113 | 114 | NOTE: We recommend that users not use older versions of IOPro. 115 | 116 | .. toctree:: 117 | :maxdepth: 1 118 | 119 | IOPro 1.8.0 <1.8.0/index> 120 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | If you do not already have Anaconda installed, please download it via the 5 | `downloads page `_ and install it. 6 | 7 | IOPro is included with `Anaconda Workgroup and Anaconda Enterprise 8 | subscriptions `_. 9 | 10 | To start a 30-day free trial just download and install the IOPro package. 11 | 12 | If you already have `Anaconda `_ (free 13 | Python platform) or `Miniconda ` 14 | installed:: 15 | 16 | conda update conda 17 | conda install iopro 18 | 19 | If you do not have Anaconda installed, you can `download it 20 | `_. 21 | 22 | For more information about IOPro please contact `sales@continuum.io 23 | `_. 24 | 25 | IOPro Update Instructions 26 | ------------------------- 27 | 28 | If you have Anaconda (free Python platform) installed, first update 29 | the conda package management tool to the latest version, then use conda 30 | to update the IOPro product installation:: 31 | 32 | conda update conda 33 | conda update iopro 34 | 35 | Uninstall 36 | --------- 37 | 38 | To uninstall using conda:: 39 | 40 | conda remove iopro 41 | 42 | 43 | Installing license 44 | ------------------ 45 | 46 | The IOPro license can be installed with the graphical Anaconda Navigator license 47 | manager or manually with your operating system. In your organization this may be 48 | handled by your site administrator or IT department. 
Both installation methods 49 | are explained in the :doc:`License installation ` 50 | page. 51 | -------------------------------------------------------------------------------- /docs/loadtxt.rst: -------------------------------------------------------------------------------- 1 | ------------- 2 | iopro.loadtxt 3 | ------------- 4 | 5 | Load data from a text file. 6 | 7 | Each row in the text file must have the same number of values. 8 | 9 | Parameters 10 | ---------- 11 | fname : file or str 12 | File, filename, or generator to read. If the filename extension is 13 | ``.gz`` or ``.bz2``, the file is first decompressed. Note that 14 | generators should return byte strings for Python 3k. 15 | dtype : data-type, optional 16 | Data-type of the resulting array; default: float. If this is a 17 | record data-type, the resulting array will be 1-dimensional, and 18 | each row will be interpreted as an element of the array. In this 19 | case, the number of columns used must match the number of fields in 20 | the data-type. 21 | comments : str, optional 22 | The character used to indicate the start of a comment; 23 | default: '#'. 24 | delimiter : str, optional 25 | The string used to separate values. By default, this is any 26 | whitespace. 27 | converters : dict, optional 28 | A dictionary mapping column number to a function that will convert 29 | that column to a float. E.g., if column 0 is a date string: 30 | ``converters = {0: datestr2num}``. Converters can also be used to 31 | provide a default value for missing data (but see also `iopro.genfromtxt`): 32 | ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None. 33 | skiprows : int, optional 34 | Skip the first `skiprows` lines; default: 0. 35 | usecols : sequence, optional 36 | Which columns to read, with 0 being the first. For example, 37 | ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. 38 | The default, None, results in all columns being read. 
39 | unpack : bool, optional 40 | If True, the returned array is transposed, so that arguments may be 41 | unpacked using ``x, y, z = iopro.loadtxt(...)``. When used with a record 42 | data-type, arrays are returned for each field. Default is False. 43 | ndmin : int, optional 44 | The returned array will have at least `ndmin` dimensions. 45 | Otherwise mono-dimensional axes will be squeezed. 46 | Legal values: 0 (default), 1 or 2. 47 | .. versionadded:: 1.6.0 48 | 49 | Returns 50 | ------- 51 | out : ndarray 52 | Data read from the text file. 53 | 54 | See Also 55 | -------- 56 | iopro.genfromtxt : Load data with missing values handled as specified. 57 | 58 | Examples 59 | -------- 60 | 61 | simple parse of StringIO object data 62 | >>> import iopro 63 | >>> from io import StringIO # StringIO behaves like a file object 64 | >>> c = StringIO("0 1\\n2 3") 65 | >>> iopro.loadtxt(c) 66 | >>> array([[ 0., 1.], 67 | [ 2., 3.]]) 68 | 69 | set dtype of output array 70 | >>> d = StringIO("M 21 72\\nF 35 58") 71 | >>> iopro.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'), 72 | ... 'formats': ('S1', 'i4', 'f4')}) 73 | >>> array([('M', 21, 72.0), ('F', 35, 58.0)], 74 | dtype=[('gender', '|S1'), ('age', '>> c = StringIO("1,0,2\\n3,0,4") 78 | >>> x, y = iopro.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True) 79 | >>> x 80 | >>> array([ 1., 3.]) 81 | >>> y 82 | >>> array([ 2., 4.]) 83 | 84 | 85 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. 
The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\IOPro.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\IOPro.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 
187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/release-notes.rst: -------------------------------------------------------------------------------- 1 | ../CHANGELOG -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: textadapter 2 | dependencies: 3 | - ipython 4 | - numpy 5 | - pandas 6 | - pytest 7 | - cython 8 | - pcre 9 | - zlib 10 | - nose 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from distutils.core import setup, Command 4 | from distutils.extension import Extension 5 | from Cython.Distutils import build_ext 6 | import numpy 7 | import versioneer 8 | 9 | 10 | class CleanInplace(Command): 11 | user_options = [] 12 | 13 | def initialize_options(self): 14 | self.cwd = None 15 | 16 | def finalize_options(self): 17 | self.cwd = os.getcwd() 18 | 19 | def run(self): 20 | files = ['./textadapter/core/TextAdapter.c', 21 | './textadapter/core/TextAdapter.so'] 22 | for file in files: 23 | try: 24 | os.remove(file) 25 | except OSError: 26 | pass 27 | 28 | 29 | def setup_text(include_dirs, lib_dirs): 30 | src = ['textadapter/core/TextAdapter.pyx', 31 | 'textadapter/core/text_adapter.c', 32 | 'textadapter/lib/converter_functions.c', 33 | 'textadapter/core/io_functions.c', 34 | 'textadapter/lib/field_info.c', 35 | 'textadapter/core/json_tokenizer.c'] 36 | 37 | if sys.platform == 'win32': 38 | zlib_lib = 'zlibstatic' 39 | else: 40 | zlib_lib = 'z' 41 | 42 | compile_args = [] 43 | if '--debug' in sys.argv: 44 | if sys.platform == 'win32': 45 | compile_args.append('/DDEBUG_ADAPTER') 46 | else: 47 | compile_args.append('-DDEBUG_ADAPTER') 48 | 49 | libraries = ['pcre', zlib_lib] 50 | 
include_dirs = ['textadapter/core'] + include_dirs 51 | 52 | return Extension("textadapter.core.TextAdapter", 53 | src, 54 | include_dirs=include_dirs, 55 | library_dirs=lib_dirs, 56 | libraries=libraries, 57 | extra_compile_args=compile_args) 58 | 59 | 60 | def run_setup(): 61 | include_dirs = [os.path.join('textadapter', 'lib'), 62 | numpy.get_include()] 63 | if sys.platform == 'win32': 64 | include_dirs.append(os.path.join(sys.prefix, 'Library', 'include')) 65 | else: 66 | include_dirs.append(os.path.join(sys.prefix, 'include')) 67 | 68 | lib_dirs = [] 69 | if sys.platform == 'win32': 70 | lib_dirs.append(os.path.join(sys.prefix, 'Library', 'lib')) 71 | else: 72 | lib_dirs.append(os.path.join(sys.prefix, 'lib')) 73 | 74 | ext_modules = [] 75 | packages = ['textadapter', 'textadapter.lib', 'textadapter.tests'] 76 | 77 | ext_modules.append(setup_text(include_dirs, lib_dirs)) 78 | packages.append('textadapter.core') 79 | 80 | versioneer.versionfile_source = 'textadapter/_version.py' 81 | versioneer.versionfile_build = 'textadapter/_version.py' 82 | versioneer.tag_prefix = '' 83 | versioneer.parentdir_prefix = 'textadapter-' 84 | 85 | cmdclass = versioneer.get_cmdclass() 86 | cmdclass['build_ext'] = build_ext 87 | cmdclass['cleanall'] = CleanInplace 88 | 89 | setup(name='textadapter', 90 | version = versioneer.get_version(), 91 | description='optimized IO for NumPy/Blaze', 92 | author='Continuum Analytics', 93 | author_email='david.mertz@continuum.io', 94 | ext_modules=ext_modules, 95 | packages=packages, 96 | cmdclass=cmdclass) 97 | 98 | 99 | if __name__ == '__main__': 100 | run_setup() 101 | -------------------------------------------------------------------------------- /setupegg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Wrapper to run setup.py using setuptools.""" 3 | 4 | import os, sys 5 | 6 | # now, import setuptools and call the actual setup 7 | import setuptools 8 | try: 9 | 
execfile('setup.py') 10 | except NameError: 11 | exec( open('setup.py','rb').read() ) 12 | -------------------------------------------------------------------------------- /textadapter/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TextAdapter 3 | ~~~~~ 4 | 5 | TextAdapter provides tools to interface large data files in a fast, memory-efficient way. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | from textadapter._version import get_versions 10 | __version__ = get_versions()['version'] 11 | del get_versions 12 | 13 | from textadapter.core.TextAdapter import (ArrayDealloc, CSVTextAdapter, 14 | FixedWidthTextAdapter, JSONTextAdapter, 15 | RegexTextAdapter, s3_text_adapter, 16 | text_adapter) 17 | from textadapter.core.loadtxt import loadtxt 18 | from textadapter.core.genfromtxt import genfromtxt 19 | from textadapter.lib.errors import (AdapterException, AdapterIndexError, 20 | ArgumentError, ConfigurationError, 21 | DataIndexError, DataTypeError, 22 | InternalInconsistencyError, NoSuchFieldError, 23 | ParserError, SourceError, SourceNotFoundError) 24 | 25 | 26 | def test(verbosity=1, num_records=100000, results=[]): 27 | from textadapter.tests.test_TextAdapter import run as run_textadapter_tests 28 | result_text = run_textadapter_tests(verbosity=verbosity, 29 | num_records=num_records) 30 | results.append(result_text) 31 | 32 | from textadapter.tests.test_io import run as run_io_tests 33 | result_text = run_io_tests(verbosity=verbosity) 34 | results.append(result_text) 35 | 36 | for result in results: 37 | if not result.wasSuccessful(): 38 | return False 39 | return True 40 | 41 | -------------------------------------------------------------------------------- /textadapter/_version.py: -------------------------------------------------------------------------------- 1 | IN_LONG_VERSION_PY = True 2 | # This file helps to compute a version number in source trees obtained from 3 | # git-archive tarball 
(such as those provided by github's download-from-tag 4 | # feature). Distribution tarballs (build by setup.py sdist) and build 5 | # directories (produced by setup.py build) will contain a much shorter file 6 | # that just contains the computed version number. 7 | 8 | # This file is released into the public domain. Generated by 9 | # versioneer-0.7+ (https://github.com/warner/python-versioneer) 10 | 11 | # these strings will be replaced by git during git-archive 12 | git_refnames = " (HEAD -> master, tag: v2.0.0)" 13 | git_full = "53138c2277cdfcf32e127251313d4f77f81050aa" 14 | 15 | GIT = "git" 16 | 17 | 18 | import subprocess 19 | import sys 20 | 21 | def run_command(args, cwd=None, verbose=False): 22 | try: 23 | # remember shell=False, so use git.cmd on windows, not just git 24 | p = subprocess.Popen(args, stdout=subprocess.PIPE, cwd=cwd) 25 | except EnvironmentError: 26 | e = sys.exc_info()[1] 27 | if verbose: 28 | print("unable to run %s" % args[0]) 29 | print(e) 30 | return None 31 | stdout = p.communicate()[0].strip() 32 | if sys.version >= '3': 33 | stdout = stdout.decode() 34 | if p.returncode != 0: 35 | if verbose: 36 | print("unable to run %s (error)" % args[0]) 37 | return None 38 | return stdout 39 | 40 | 41 | import sys 42 | import re 43 | import os.path 44 | 45 | def get_expanded_variables(versionfile_source): 46 | # the code embedded in _version.py can just fetch the value of these 47 | # variables. When used from setup.py, we don't want to import 48 | # _version.py, so we do it with a regexp instead. This function is not 49 | # used from _version.py. 
50 | variables = {} 51 | try: 52 | for line in open(versionfile_source,"r").readlines(): 53 | if line.strip().startswith("git_refnames ="): 54 | mo = re.search(r'=\s*"(.*)"', line) 55 | if mo: 56 | variables["refnames"] = mo.group(1) 57 | if line.strip().startswith("git_full ="): 58 | mo = re.search(r'=\s*"(.*)"', line) 59 | if mo: 60 | variables["full"] = mo.group(1) 61 | except EnvironmentError: 62 | pass 63 | return variables 64 | 65 | def versions_from_expanded_variables(variables, tag_prefix, verbose=False): 66 | refnames = variables["refnames"].strip() 67 | if refnames.startswith("$Format"): 68 | if verbose: 69 | print("variables are unexpanded, not using") 70 | return {} # unexpanded, so not in an unpacked git-archive tarball 71 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) 72 | for ref in list(refs): 73 | if not re.search(r'\d', ref): 74 | if verbose: 75 | print("discarding '%s', no digits" % ref) 76 | refs.discard(ref) 77 | # Assume all version tags have a digit. git's %d expansion 78 | # behaves like git log --decorate=short and strips out the 79 | # refs/heads/ and refs/tags/ prefixes that would let us 80 | # distinguish between branches and tags. By ignoring refnames 81 | # without digits, we filter out many common branch names like 82 | # "release" and "stabilization", as well as "HEAD" and "master". 83 | if verbose: 84 | print("remaining refs: %s" % ",".join(sorted(refs))) 85 | for ref in sorted(refs): 86 | # sorting will prefer e.g. 
"2.0" over "2.0rc1" 87 | if ref.startswith(tag_prefix): 88 | r = ref[len(tag_prefix):] 89 | if verbose: 90 | print("picking %s" % r) 91 | return { "version": r, 92 | "full": variables["full"].strip() } 93 | # no suitable tags, so we use the full revision id 94 | if verbose: 95 | print("no suitable tags, using full revision id") 96 | return { "version": variables["full"].strip(), 97 | "full": variables["full"].strip() } 98 | 99 | def versions_from_vcs(tag_prefix, versionfile_source, verbose=False): 100 | # this runs 'git' from the root of the source tree. That either means 101 | # someone ran a setup.py command (and this code is in versioneer.py, so 102 | # IN_LONG_VERSION_PY=False, thus the containing directory is the root of 103 | # the source tree), or someone ran a project-specific entry point (and 104 | # this code is in _version.py, so IN_LONG_VERSION_PY=True, thus the 105 | # containing directory is somewhere deeper in the source tree). This only 106 | # gets called if the git-archive 'subst' variables were *not* expanded, 107 | # and _version.py hasn't already been rewritten with a short version 108 | # string, meaning we're inside a checked out source tree. 109 | 110 | try: 111 | here = os.path.abspath(__file__) 112 | except NameError: 113 | # some py2exe/bbfreeze/non-CPython implementations don't do __file__ 114 | return {} # not always correct 115 | 116 | # versionfile_source is the relative path from the top of the source tree 117 | # (where the .git directory might live) to this file. Invert this to find 118 | # the root from __file__. 
119 | root = here 120 | if IN_LONG_VERSION_PY: 121 | for i in range(len(versionfile_source.split("/"))): 122 | root = os.path.dirname(root) 123 | else: 124 | root = os.path.dirname(here) 125 | if not os.path.exists(os.path.join(root, ".git")): 126 | if verbose: 127 | print("no .git in %s" % root) 128 | return {} 129 | 130 | stdout = run_command([GIT, "describe", "--tags", "--always"], 131 | cwd=root) 132 | if stdout is None: 133 | return {} 134 | if not stdout.startswith(tag_prefix): 135 | if verbose: 136 | print("tag '%s' doesn't start with prefix '%s'" % (stdout, tag_prefix)) 137 | return {} 138 | tag = stdout[len(tag_prefix):] 139 | stdout = run_command([GIT, "rev-parse", "HEAD"], cwd=root) 140 | if stdout is None: 141 | return {} 142 | full = stdout.strip() 143 | return {"version": tag, "full": full} 144 | 145 | 146 | def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False): 147 | if IN_LONG_VERSION_PY: 148 | # We're running from _version.py. If it's from a source tree 149 | # (execute-in-place), we can work upwards to find the root of the 150 | # tree, and then check the parent directory for a version string. If 151 | # it's in an installed application, there's no hope. 152 | try: 153 | here = os.path.abspath(__file__) 154 | except NameError: 155 | # py2exe/bbfreeze/non-CPython don't have __file__ 156 | return {} # without __file__, we have no hope 157 | # versionfile_source is the relative path from the top of the source 158 | # tree to _version.py. Invert this to find the root from __file__. 159 | root = here 160 | for i in range(len(versionfile_source.split("/"))): 161 | root = os.path.dirname(root) 162 | else: 163 | # we're running from versioneer.py, which means we're running from 164 | # the setup.py in a source tree. sys.argv[0] is setup.py in the root. 
165 | here = os.path.abspath(sys.argv[0]) 166 | root = os.path.dirname(here) 167 | 168 | # Source tarballs conventionally unpack into a directory that includes 169 | # both the project name and a version string. 170 | dirname = os.path.basename(root) 171 | if not dirname.startswith(parentdir_prefix): 172 | if verbose: 173 | print("guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" % 174 | (root, dirname, parentdir_prefix)) 175 | return None 176 | return {"version": dirname[len(parentdir_prefix):], "full": ""} 177 | 178 | tag_prefix = "" 179 | parentdir_prefix = "textadapter-" 180 | versionfile_source = "textadapter/_version.py" 181 | 182 | def get_versions(default={"version": "unknown", "full": ""}, verbose=False): 183 | variables = { "refnames": git_refnames, "full": git_full } 184 | ver = versions_from_expanded_variables(variables, tag_prefix, verbose) 185 | if not ver: 186 | ver = versions_from_vcs(tag_prefix, versionfile_source, verbose) 187 | if not ver: 188 | ver = versions_from_parentdir(parentdir_prefix, versionfile_source, 189 | verbose) 190 | if not ver: 191 | ver = default 192 | return ver 193 | 194 | -------------------------------------------------------------------------------- /textadapter/core/IO.pyx: -------------------------------------------------------------------------------- 1 | 2 | cdef InputData* open_s3(object data): 3 | """ 4 | Set up read/seek functions for S3 data source 5 | """ 6 | cdef InputData *input_data = calloc(1, sizeof(InputData)) 7 | input_data.seek = &seek_s3 8 | input_data.read = &read_s3 9 | input_data.close = &close_s3 10 | input_data.input = data 11 | return input_data 12 | 13 | cdef void close_s3(InputData *input_data): 14 | """ 15 | Clean up InputData for S3 data source 16 | """ 17 | if input_data != NULL: 18 | free(input_data) 19 | 20 | cdef AdapterError seek_s3(InputData *input, uint64_t offset): 21 | """ 22 | Seek to offset in S3 data source 23 | 24 | Arguments: 25 | input - InputData struct 26 | 
offset - offset to seek to 27 | """ 28 | if (input == NULL): 29 | return ADAPTER_ERROR_SEEK; 30 | 31 | s3_input = input.input 32 | 33 | s3_key = s3_input['s3_key'] 34 | if offset > (s3_key.size - input.header): 35 | return ADAPTER_ERROR_SEEK_S3 36 | 37 | s3_input['offset'] = offset + input.header 38 | return ADAPTER_SUCCESS 39 | 40 | 41 | cdef AdapterError read_s3(InputData *input, char *buffer, uint64_t buffer_len, uint64_t *num_bytes_read): 42 | """ 43 | Read bytes from S3 data source and store in buffer. 44 | 45 | Arguments: 46 | input - text adapter struct 47 | buffer - output buffer for data read from S3 48 | buffer_len - length of buffer 49 | num_bytes_read - pointer to variable for storing number of bytes read from S3 50 | """ 51 | if (input == NULL): 52 | return ADAPTER_ERROR_SEEK; 53 | 54 | s3_input = input.input 55 | offset = s3_input['offset'] 56 | s3_key = s3_input['s3_key'] 57 | 58 | if offset >= s3_key.size: 59 | num_bytes_read[0] = 0 60 | return ADAPTER_ERROR_READ_EOF 61 | 62 | if offset < 0: 63 | return ADAPTER_ERROR_READ 64 | 65 | try: 66 | data = s3_key.get_contents_as_string(headers={'Range' : 'bytes={0}-{1}'.format(offset, offset+buffer_len)}) 67 | except: 68 | return ADAPTER_ERROR_READ_S3 69 | data_len = len(data) 70 | 71 | if data_len > buffer_len: 72 | data_len = buffer_len 73 | 74 | memcpy(buffer, data, data_len) 75 | num_bytes_read[0] = data_len 76 | 77 | s3_input['offset'] = s3_input['offset'] + data_len 78 | 79 | return ADAPTER_SUCCESS 80 | -------------------------------------------------------------------------------- /textadapter/core/TextAdapter.pxd: -------------------------------------------------------------------------------- 1 | cimport numpy 2 | 3 | cdef extern from '_stdint.h': 4 | # Actual type lengths are defined in _stdint.h 5 | # Sizes here are just place holders 6 | ctypedef unsigned long long uint64_t 7 | ctypedef unsigned int uint32_t 8 | ctypedef unsigned short uint16_t 9 | ctypedef unsigned char uint8_t 10 | ctypedef 
long long int64_t 11 | ctypedef int int32_t 12 | ctypedef short int16_t 13 | ctypedef char int8_t 14 | uint64_t UINT64_MAX 15 | 16 | cdef extern from 'string.h': 17 | void *memcpy(void *, void *, size_t) 18 | char *strncpy(char *, char *, size_t) 19 | void *memset(void *, int, size_t) 20 | 21 | cdef extern from "Python.h": 22 | ctypedef struct PyObject 23 | ctypedef struct FILE 24 | FILE* PyFile_AsFile(object) 25 | 26 | cdef extern from 'stdlib.h': 27 | void* calloc(size_t, size_t) 28 | void* malloc(size_t) 29 | void* realloc(void *, size_t) 30 | void free(void *) 31 | 32 | 33 | cdef extern from "../lib/khash.h": 34 | 35 | ctypedef uint32_t khint_t 36 | ctypedef khint_t khiter_t 37 | ctypedef char* kh_cstr_t 38 | 39 | ctypedef struct kh_string_t: 40 | khint_t n_buckets, size, n_occupied, upper_bound 41 | khint_t *flags 42 | kh_cstr_t *keys 43 | PyObject **vals 44 | 45 | kh_string_t* kh_init_string() 46 | void kh_destroy_string(kh_string_t*) 47 | khint_t kh_get_string(kh_string_t*, kh_cstr_t) 48 | khint_t kh_put_string(kh_string_t*, kh_cstr_t, int*) 49 | khint_t kh_str_hash_func(const char *s) 50 | khint_t kh_exist(kh_string_t*, khint_t) 51 | 52 | cdef extern from "numpy/arrayobject.h": 53 | object PyArray_NewFromDescr(object subtype, numpy.dtype descr, 54 | int nd, numpy.npy_intp* dims, 55 | numpy.npy_intp* strides, void* data, 56 | int flags, object obj) 57 | struct PyArray_Descr: 58 | int type_num, elsize 59 | char type 60 | 61 | cdef enum: 62 | NPY_MAXDIMS 63 | 64 | 65 | cdef extern from "zlib.h": 66 | int inflateEnd(void *) 67 | 68 | cdef extern from "io_functions.h": 69 | InputData* open_file(const char *filename) 70 | void close_file(InputData *input) 71 | AdapterError read_file(InputData *input, char *buffer, uint64_t len, 72 | uint64_t *num_bytes_read) 73 | AdapterError seek_file(InputData *input, uint64_t offset) 74 | 75 | InputData* open_memmap(char *data, size_t size) 76 | void close_memmap(InputData *input) 77 | AdapterError read_memmap(InputData 
*input, char *buffer, uint64_t len, 78 | uint64_t *num_bytes_read) 79 | AdapterError seek_memmap(InputData *input, uint64_t offset) 80 | 81 | AdapterError read_gzip(InputData *input, char *buffer, uint64_t len, 82 | uint64_t *num_bytes_read) 83 | AdapterError seek_gzip(InputData *input, uint64_t offset) 84 | 85 | void init_gzip(InputData *input) 86 | void close_gzip(InputData *input) 87 | 88 | cdef extern from 'converter_functions.h': 89 | 90 | ctypedef enum ConvertError: 91 | CONVERT_SUCCESS 92 | CONVERT_ERROR_UNKNOWN 93 | CONVERT_ERROR_OVERFLOW 94 | CONVERT_ERROR_INPUT_TYPE 95 | CONVERT_ERROR_INPUT_SIZE 96 | CONVERT_ERROR_OUTPUT_SIZE 97 | CONVERT_ERROR_INPUT_STRING 98 | CONVERT_ERROR_USER_CONVERTER 99 | CONVERT_ERROR_OBJECT_CONVERTER 100 | CONVERT_ERROR_NUMBA 101 | CONVERT_ERROR_LAST 102 | 103 | ctypedef ConvertError (*converter_func_ptr)(void *, uint32_t, int, 104 | void *, uint32_t, void *) 105 | 106 | ConvertError str2int_converter(void *input, uint32_t input_len, 107 | int input_type, void *output, uint32_t output_len, void *arg) 108 | ConvertError str2uint_converter(void *input, uint32_t input_len, 109 | int input_type, void *output, uint32_t output_len, void *arg) 110 | ConvertError str2float_converter(void *input, uint32_t input_len, 111 | int input_type, void *output, uint32_t output_len, void *arg) 112 | ConvertError str2str_converter(void *input, uint32_t input_len, 113 | int input_type, void *output, uint32_t output_len, void *arg) 114 | ConvertError str2complex_converter(void *input, uint32_t input_len, 115 | int input_type, void *output, uint32_t output_len, void *arg) 116 | 117 | cdef extern from 'index.h': 118 | enum: UNCOMPRESSED_WINDOW_SIZE 119 | enum: DEFAULT_INDEX_DENSITY 120 | enum: GZIP_ACCESS_POINT_DISTANCE 121 | 122 | ctypedef struct RecordOffset: 123 | uint64_t record_num 124 | uint64_t offset 125 | 126 | ctypedef struct GzipIndexAccessPoint: 127 | uint8_t bits 128 | uint64_t compressed_offset 129 | uint64_t uncompressed_offset 130 | 
unsigned char window[UNCOMPRESSED_WINDOW_SIZE] 131 | 132 | ctypedef void (*indexer_func_ptr)(void *index, uint64_t record_num, 133 | uint64_t record_offset) 134 | ctypedef RecordOffset (*index_lookup_func_ptr)(void *index, 135 | uint64_t record_num) 136 | 137 | ctypedef void (*add_gzip_access_point_func_ptr)(void *index, 138 | unsigned char *buffer, 139 | uint32_t compressed_offset, uint64_t uncompressed_offset, 140 | int avail_in, int avail_out, uint8_t data_type) 141 | 142 | ctypedef void (*get_gzip_access_point_func_ptr)(void *index, 143 | uint64_t offset, GzipIndexAccessPoint *point) 144 | 145 | cdef extern from 'field_info.h': 146 | ctypedef struct MissingValues: 147 | char **missing_values 148 | uint32_t *missing_value_lens 149 | uint32_t num_missing_values 150 | 151 | ctypedef struct FillValue: 152 | void *fill_value 153 | int loose 154 | 155 | ctypedef struct FieldInfo: 156 | char *name 157 | converter_func_ptr converter 158 | void *converter_arg 159 | MissingValues missing_values 160 | FillValue fill_value 161 | uint32_t input_field_width 162 | uint32_t output_field_size 163 | int infer_type 164 | 165 | ctypedef struct FieldList: 166 | uint32_t num_fields 167 | FieldInfo *field_info 168 | 169 | void set_num_fields(FieldList *fields, uint32_t num_fields) 170 | void init_missing_values(FieldList *fields, char *field_name, 171 | uint32_t field_num, uint32_t num_missing_values) 172 | void add_missing_value(FieldList *fields, char *field_name, 173 | uint32_t field_num, char *missing_value, uint32_t missing_value_len) 174 | void set_fill_value(FieldList *fields, char *field_name, 175 | uint32_t field_num, void *fill_value, uint32_t fill_value_len, int loose) 176 | uint32_t get_field_size(FieldList *fields, char *field_name, uint32_t field_num) 177 | 178 | ctypedef enum DefaultConverterFuncs: 179 | UINT_CONVERTER_FUNC 180 | INT_CONVERTER_FUNC 181 | FLOAT_CONVERTER_FUNC 182 | STRING_CONVERTER_FUNC 183 | STRING_OBJECT_CONVERTER_FUNC 184 | NUM_CONVERTER_FUNCS 185 | 
186 | void set_field_width(FieldList *fields, uint32_t field, uint32_t width) 187 | void reset_converters(FieldList *fields) 188 | void set_converter(FieldList *fields, uint32_t field_num, char *field_name, 189 | uint32_t output_field_size, converter_func_ptr converter, void *converter_arg) 190 | int infer_types(FieldList *fields) 191 | 192 | 193 | cdef extern from 'text_adapter.h': 194 | ctypedef enum AdapterError: 195 | ADAPTER_SUCCESS 196 | ADAPTER_ERROR_SEEK 197 | ADAPTER_ERROR_SEEK_EOF 198 | ADAPTER_ERROR_SEEK_S3 199 | ADAPTER_ERROR_READ 200 | ADAPTER_ERROR_READ_EOF 201 | ADAPTER_ERROR_READ_S3 202 | ADAPTER_ERROR_NO_FIELDS 203 | ADAPTER_ERROR_CONVERT 204 | ADAPTER_ERROR_INDEX 205 | ADAPTER_ERROR_PROCESS_TOKEN 206 | ADAPTER_ERROR_READ_TOKENS 207 | ADAPTER_ERROR_READ_RECORDS 208 | ADAPTER_ERROR_JSON 209 | ADAPTER_ERROR_INVALID_CHAR_CODE 210 | ADAPTER_ERROR_LAST 211 | 212 | 213 | ctypedef AdapterError (*read_func_ptr)(void *input, char *buffer, 214 | uint64_t len, uint64_t *num_bytes_read) 215 | ctypedef AdapterError (*seek_func_ptr)(void *input, uint64_t offset) 216 | ctypedef void (*close_func_ptr)(InputData *input) 217 | ctypedef AdapterError (*tokenize_func_ptr)(text_adapter_t *adapter, 218 | uint64_t num_tokens, uint64_t step, char **output, 219 | uint64_t *num_tokens_found, int enable_index, uint64_t index_density) 220 | 221 | ctypedef struct InputData: 222 | void *input 223 | read_func_ptr read 224 | seek_func_ptr seek 225 | close_func_ptr close 226 | void *compressed_input 227 | char *compressed_prebuffer 228 | read_func_ptr read_compressed 229 | seek_func_ptr seek_compressed 230 | get_gzip_access_point_func_ptr get_gzip_access_point 231 | uint64_t header 232 | uint64_t footer 233 | uint64_t start_record 234 | uint64_t start_offset 235 | void *index 236 | 237 | ctypedef struct MemMapInput: 238 | char *data 239 | uint64_t size 240 | uint64_t position 241 | 242 | ctypedef struct GzipInput: 243 | z_stream *z 244 | uint32_t compressed_bytes_processed 245 | 
uint64_t uncompressed_bytes_processed 246 | int buffer_refreshed 247 | void *uncompressed_input 248 | 249 | ctypedef struct JsonTokenizerArgs: 250 | JSON_checker_struct *jc 251 | 252 | ctypedef struct RegexTokenizerArgs: 253 | pcre *pcre_regex 254 | pcre_extra *extra_regex 255 | 256 | ctypedef struct ConvertErrorInfo: 257 | ConvertError convert_result 258 | char *token 259 | uint64_t record_num 260 | uint64_t field_num 261 | 262 | struct text_adapter_t: 263 | char delim_char 264 | char comment_char 265 | char quote_char 266 | char escape_char 267 | uint64_t num_records 268 | InputData *input_data 269 | tokenize_func_ptr tokenize 270 | void *tokenize_args 271 | uint64_t *field_widths 272 | void *index 273 | uint64_t index_density 274 | indexer_func_ptr indexer 275 | index_lookup_func_ptr index_lookup 276 | add_gzip_access_point_func_ptr add_gzip_access_point 277 | int infer_types_mode 278 | FieldList *fields 279 | int group_whitespace_delims 280 | int any_whitespace_as_delim 281 | int skipblanklines 282 | int reset_json_args 283 | 284 | AdapterError delim_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 285 | uint64_t step, char **output, uint64_t *num_tokens_found, 286 | int enable_index, uint64_t index_density) 287 | AdapterError json_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 288 | uint64_t step, char **output, uint64_t *num_tokens_found, 289 | int enable_index, uint64_t index_density) 290 | AdapterError json_record_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 291 | uint64_t step, char **output, uint64_t *num_tokens_found, 292 | int enable_index, uint64_t index_density) 293 | AdapterError regex_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 294 | uint64_t step, char **output, uint64_t *num_tokens_found, 295 | int enable_index, uint64_t index_density) 296 | AdapterError fixed_width_tokenizer(text_adapter_t *adapter, 297 | uint64_t num_tokens, uint64_t step, char **output, 298 | uint64_t *num_tokens_found, int enable_index, 
uint64_t index_density) 299 | AdapterError record_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 300 | uint64_t step, char **output, uint64_t *num_tokens_found, 301 | int enable_index, uint64_t index_density) 302 | AdapterError line_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 303 | uint64_t step, char **output, uint64_t *num_tokens_found, 304 | int enable_index, uint64_t index_density) 305 | 306 | AdapterError build_index(text_adapter_t *adapter) 307 | AdapterError build_gzip_index(text_adapter_t *adapter) 308 | 309 | text_adapter_t* open_text_adapter(InputData *input_data) 310 | 311 | void close_text_adapter(text_adapter_t *adapter) 312 | 313 | AdapterError seek_record(text_adapter_t *t, uint64_t rec_num) 314 | AdapterError seek_offset(text_adapter_t *t, uint64_t offset) 315 | AdapterError read_records(text_adapter_t *adapter, uint64_t num_records, 316 | uint64_t step, char *output, uint64_t *num_records_found) 317 | 318 | ConvertErrorInfo get_error_info() 319 | 320 | 321 | # NOTE: This is after "text_adapter.h" so that 322 | # PCRE_STATIC gets defined before including pcre.h. 323 | # This is necessary for the Windows build. 
324 | cdef extern from "pcre.h": 325 | struct pcre 326 | struct pcre_extra 327 | pcre* pcre_compile(char *, int, char **, int *, unsigned char *) 328 | pcre_extra* pcre_study(pcre *, int, char **) 329 | 330 | cdef extern from "zlib.h": 331 | ctypedef struct z_stream: 332 | pass 333 | 334 | cdef extern from "json_tokenizer.h": 335 | struct JSON_checker_struct 336 | JSON_checker_struct* new_JSON_checker(int depth) 337 | 338 | cdef extern converter_func_ptr default_converters[NUM_CONVERTER_FUNCS] 339 | -------------------------------------------------------------------------------- /textadapter/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContinuumIO/TextAdapter/53138c2277cdfcf32e127251313d4f77f81050aa/textadapter/core/__init__.py -------------------------------------------------------------------------------- /textadapter/core/index.h: -------------------------------------------------------------------------------- 1 | #ifndef INDEX_H 2 | #define INDEX_H 3 | 4 | #include <_stdint.h> 5 | 6 | 7 | /* buffer size of uncompressed gzip data */ 8 | #define UNCOMPRESSED_WINDOW_SIZE 32768 9 | 10 | /* Default index density value. 
Density value determines how many records 11 | to skip between each indexed record */ 12 | #define DEFAULT_INDEX_DENSITY 1000 13 | 14 | /* Default distance in bytes between gzip access points */ 15 | #define GZIP_ACCESS_POINT_DISTANCE 1024 * 1024 16 | 17 | 18 | typedef struct record_offset_t 19 | { 20 | uint64_t record_num; 21 | uint64_t offset; 22 | } RecordOffset; 23 | 24 | 25 | typedef struct gzip_index_access_point_t 26 | { 27 | uint8_t bits; 28 | uint64_t compressed_offset; 29 | uint64_t uncompressed_offset; 30 | unsigned char window[UNCOMPRESSED_WINDOW_SIZE]; 31 | } GzipIndexAccessPoint; 32 | 33 | 34 | /* indexer function pointer type */ 35 | typedef void (*indexer_func_ptr)(void *index, uint64_t record_num, 36 | uint64_t record_offset); 37 | 38 | typedef RecordOffset (*index_lookup_func_ptr)(void *index, uint64_t record_num); 39 | 40 | /* add gzip access point function pointer type */ 41 | typedef void (*add_gzip_access_point_func_ptr)(void *index, 42 | unsigned char *buffer, 43 | uint32_t compressed_offset, 44 | uint64_t uncompressed_offset, 45 | int avail_in, 46 | int avail_out, 47 | uint8_t data_type); 48 | 49 | typedef void (*get_gzip_access_point_func_ptr)(void *index, 50 | uint64_t offset, 51 | GzipIndexAccessPoint *point); 52 | 53 | void indexer_callback(void *index, uint64_t record_num, uint64_t record_offset); 54 | RecordOffset index_lookup_callback(void *index, uint64_t record_num); 55 | 56 | void add_gzip_access_point_callback(void *index, 57 | unsigned char *window, 58 | uint32_t compressed_offset, 59 | uint64_t uncompressed_offset, 60 | int avail_in, 61 | int avail_out, 62 | uint8_t bits); 63 | 64 | void get_gzip_access_point_callback(void *index, 65 | uint64_t offset, 66 | GzipIndexAccessPoint *point); 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /textadapter/core/io.h: -------------------------------------------------------------------------------- 1 | #include "text_adapter.h" 2 | 3 | 
AdapterError seek_s3(InputData *input, uint64_t offset); 4 | AdapterError read_s3(InputData *input, char *buffer, uint64_t buffer_len, 5 | uint64_t *num_bytes_read); 6 | -------------------------------------------------------------------------------- /textadapter/core/io_functions.h: -------------------------------------------------------------------------------- 1 | #ifndef IO_FUNCTIONS_H 2 | #define IO_FUNCTIONS_H 3 | 4 | #include "text_adapter.h" 5 | 6 | /* default file read/seek functions */ 7 | InputData* open_file(const char *filename); 8 | void close_file(InputData *input); 9 | AdapterError seek_file(InputData *input, uint64_t offset); 10 | AdapterError read_file(InputData *input, 11 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 12 | 13 | /* memmap read/seek functions */ 14 | InputData* open_memmap(char *data, size_t size); 15 | void close_memmap(InputData *input); 16 | AdapterError seek_memmap(InputData *input, uint64_t offset); 17 | AdapterError read_memmap(InputData *input, 18 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 19 | 20 | /* gzip read/seek functions */ 21 | AdapterError seek_gzip(InputData *input, uint64_t offset); 22 | AdapterError read_gzip(InputData *input, 23 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 24 | 25 | /* setup/teardown functions for gzip decompression data structures */ 26 | void init_gzip(InputData *input); 27 | void close_gzip(InputData *input); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /textadapter/core/json_tokenizer.c: -------------------------------------------------------------------------------- 1 | /* Adapted from json.org JSON_checker */ 2 | 3 | /* 2007-08-24 */ 4 | 5 | /* 6 | Copyright (c) 2005 JSON.org 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without 
limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | The Software shall be used for Good, not Evil. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | */ 28 | 29 | #include 30 | #include "json_tokenizer.h" 31 | 32 | #define true 1 33 | #define false 0 34 | 35 | /* 36 | Characters are mapped into these 31 character classes. This allows for 37 | a significant reduction in the size of the state transition table. 38 | */ 39 | 40 | int 41 | reject(JSON_checker jc) 42 | { 43 | /* 44 | Delete the JSON_checker object. 45 | */ 46 | free((void*)jc->stack); 47 | free((void*)jc); 48 | return false; 49 | } 50 | 51 | 52 | int 53 | push(JSON_checker jc, int mode) 54 | { 55 | /* 56 | Push a mode onto the stack. Return false if there is overflow. 57 | */ 58 | jc->top += 1; 59 | if (jc->top >= jc->depth) { 60 | return false; 61 | } 62 | jc->stack[jc->top] = mode; 63 | return true; 64 | } 65 | 66 | 67 | int 68 | pop(JSON_checker jc, int mode) 69 | { 70 | /* 71 | Pop the stack, assuring that the current mode matches the expectation. 72 | Return false if there is underflow or if the modes mismatch. 
73 | */ 74 | if (jc->top < 0 || jc->stack[jc->top] != mode) { 75 | return false; 76 | } 77 | jc->top -= 1; 78 | return true; 79 | } 80 | 81 | 82 | JSON_checker 83 | new_JSON_checker(int depth) 84 | { 85 | /* 86 | new_JSON_checker starts the checking process by constructing a JSON_checker 87 | object. It takes a depth parameter that restricts the level of maximum 88 | nesting. 89 | 90 | To continue the process, call JSON_checker_char for each character in the 91 | JSON text, and then call JSON_checker_done to obtain the final result. 92 | These functions are fully reentrant. 93 | 94 | The JSON_checker object will be deleted by JSON_checker_done. 95 | JSON_checker_char will delete the JSON_checker object if it sees an error. 96 | */ 97 | JSON_checker jc = (JSON_checker)malloc(sizeof(struct JSON_checker_struct)); 98 | jc->state = GO; 99 | jc->depth = depth; 100 | jc->top = -1; 101 | jc->stack = (int*)calloc(depth, sizeof(int)); 102 | push(jc, MODE_DONE); 103 | return jc; 104 | } 105 | 106 | 107 | int 108 | JSON_checker_char(JSON_checker jc, int next_char) 109 | { 110 | /* 111 | After calling new_JSON_checker, call this function for each character (or 112 | partial character) in your JSON text. It can accept UTF-8, UTF-16, or 113 | UTF-32. It returns true if things are looking ok so far. If it rejects the 114 | text, it deletes the JSON_checker object and returns false. 115 | */ 116 | int next_class, next_state; 117 | /* 118 | Determine the character's class. 119 | */ 120 | if (next_char < 0) { 121 | return reject(jc); 122 | } 123 | if (next_char >= 128) { 124 | next_class = C_ETC; 125 | } else { 126 | next_class = ascii_class[next_char]; 127 | if (next_class <= __) { 128 | return reject(jc); 129 | } 130 | } 131 | /* 132 | Get the next state from the state transition table. 133 | */ 134 | next_state = state_transition_table[jc->state][next_class]; 135 | if (next_state >= 0) { 136 | /* 137 | Change the state. 
138 | */ 139 | jc->state = next_state; 140 | } else { 141 | /* 142 | Or perform one of the actions. 143 | */ 144 | switch (next_state) { 145 | /* empty } */ 146 | case -33: 147 | jc->state = IN; 148 | break; 149 | case -9: 150 | if (!pop(jc, MODE_KEY)) { 151 | return reject(jc); 152 | } 153 | jc->state = OK; 154 | break; 155 | 156 | /* } */ case -8: 157 | if (!pop(jc, MODE_OBJECT)) { 158 | return reject(jc); 159 | } 160 | jc->state = OK; 161 | break; 162 | 163 | /* ] */ case -7: 164 | if (!pop(jc, MODE_ARRAY)) { 165 | return reject(jc); 166 | } 167 | jc->state = OK; 168 | break; 169 | 170 | /* { */ case -6: 171 | if (!push(jc, MODE_KEY)) { 172 | return reject(jc); 173 | } 174 | jc->state = OB; 175 | break; 176 | 177 | /* [ */ case -5: 178 | if (!push(jc, MODE_ARRAY)) { 179 | return reject(jc); 180 | } 181 | jc->state = AR; 182 | break; 183 | 184 | /* " */ case -4: 185 | switch (jc->stack[jc->top]) { 186 | case MODE_KEY: 187 | jc->state = CO; 188 | break; 189 | case MODE_ARRAY: 190 | case MODE_OBJECT: 191 | jc->state = OK; 192 | break; 193 | default: 194 | return reject(jc); 195 | } 196 | break; 197 | 198 | /* , */ case -3: 199 | switch (jc->stack[jc->top]) { 200 | case MODE_OBJECT: 201 | /* 202 | A comma causes a flip from object mode to key mode. 203 | */ 204 | if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { 205 | return reject(jc); 206 | } 207 | jc->state = KE; 208 | break; 209 | case MODE_ARRAY: 210 | jc->state = VA; 211 | break; 212 | default: 213 | return reject(jc); 214 | } 215 | break; 216 | 217 | /* : */ case -2: 218 | /* 219 | A colon causes a flip from key mode to object mode. 220 | */ 221 | if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { 222 | return reject(jc); 223 | } 224 | jc->state = VA; 225 | break; 226 | /* 227 | Bad action. 
228 | */ 229 | default: 230 | return reject(jc); 231 | } 232 | } 233 | return true; 234 | } 235 | 236 | 237 | int 238 | JSON_checker_done(JSON_checker jc) 239 | { 240 | /* 241 | The JSON_checker_done function should be called after all of the characters 242 | have been processed, but only if every call to JSON_checker_char returned 243 | true. This function deletes the JSON_checker and returns true if the JSON 244 | text was accepted. 245 | */ 246 | int result = (jc->state == OK || jc->state == NO) && pop(jc, MODE_DONE); 247 | reject(jc); 248 | return result; 249 | } 250 | -------------------------------------------------------------------------------- /textadapter/core/json_tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Adapted from json.org JSON_checker */ 2 | 3 | #ifndef JSON_TOKENIZER_H 4 | #define JSON_TOKENIZER_H 5 | 6 | #define __ -1 /* the universal error code */ 7 | 8 | enum classes { 9 | C_SPACE, /* space */ 10 | C_NEWLINE, /* newline */ 11 | C_WHITE, /* other whitespace */ 12 | C_LCURB, /* { */ 13 | C_RCURB, /* } */ 14 | C_LSQRB, /* [ */ 15 | C_RSQRB, /* ] */ 16 | C_COLON, /* : */ 17 | C_COMMA, /* , */ 18 | C_QUOTE, /* " */ 19 | C_BACKS, /* \ */ 20 | C_SLASH, /* / */ 21 | C_PLUS, /* + */ 22 | C_MINUS, /* - */ 23 | C_POINT, /* . */ 24 | C_ZERO , /* 0 */ 25 | C_DIGIT, /* 123456789 */ 26 | C_LOW_A, /* a */ 27 | C_LOW_B, /* b */ 28 | C_LOW_C, /* c */ 29 | C_LOW_D, /* d */ 30 | C_LOW_E, /* e */ 31 | C_LOW_F, /* f */ 32 | C_LOW_L, /* l */ 33 | C_LOW_N, /* n */ 34 | C_LOW_R, /* r */ 35 | C_LOW_S, /* s */ 36 | C_LOW_T, /* t */ 37 | C_LOW_U, /* u */ 38 | C_ABCDF, /* ABCDF */ 39 | C_E, /* E */ 40 | C_ETC, /* everything else */ 41 | NR_CLASSES 42 | }; 43 | 44 | static int ascii_class[128] = { 45 | /* 46 | This array maps the 128 ASCII characters into character classes. 47 | The remaining Unicode characters should be mapped to C_ETC. 48 | Non-whitespace control characters are errors. 
49 | */ 50 | __, __, __, __, __, __, __, __, 51 | __, C_WHITE, C_NEWLINE, __, __, C_WHITE, __, __, 52 | __, __, __, __, __, __, __, __, 53 | __, __, __, __, __, __, __, __, 54 | 55 | C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 56 | C_ETC, C_ETC, C_ETC, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH, 57 | C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, 58 | C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 59 | 60 | C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC, 61 | C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 62 | C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 63 | C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC, 64 | 65 | C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, 66 | C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC, 67 | C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC, 68 | C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC 69 | }; 70 | 71 | 72 | /* 73 | The state codes. 74 | */ 75 | enum states { 76 | GO, /* start */ 77 | OK, /* ok */ 78 | OB, /* object */ 79 | KE, /* key */ 80 | CO, /* colon */ 81 | VA, /* value */ 82 | AR, /* array */ 83 | ST, /* string */ 84 | ES, /* escape */ 85 | U1, /* u1 */ 86 | U2, /* u2 */ 87 | U3, /* u3 */ 88 | U4, /* u4 */ 89 | MI, /* minus */ 90 | ZE, /* zero */ 91 | IN, /* integer */ 92 | FR, /* fraction */ 93 | E1, /* e */ 94 | E2, /* ex */ 95 | E3, /* exp */ 96 | T1, /* tr */ 97 | T2, /* tru */ 98 | T3, /* true */ 99 | F1, /* fa */ 100 | F2, /* fal */ 101 | F3, /* fals */ 102 | F4, /* false */ 103 | N1, /* nu */ 104 | N2, /* nul */ 105 | N3, /* null */ 106 | NO, /* next object */ 107 | NR_STATES 108 | }; 109 | 110 | 111 | static int state_transition_table[NR_STATES][NR_CLASSES] = { 112 | /* 113 | The state transition table takes the current state and the current symbol, 114 | and returns either a new state or an action. An action is represented as a 115 | negative number. 
A JSON text is accepted if at the end of the text the 116 | state is OK and if the mode is MODE_DONE. 117 | 118 | newline white 1-9 ABCDF etc 119 | space | | { } [ ] : , " \ / + - . 0 | a b c d e f l n r s t u | E | */ 120 | /*start GO*/ { GO, __, GO, -6, __, -5, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 121 | /*ok OK*/ { OK, NO, OK, __, -8, __, -7, __, -3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 122 | /*object OB*/ { OB, __, OB, __, -9, __, __, __, __, ST, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 123 | /*key KE*/ { KE, __, KE, __, __, __, __, __, __, ST, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 124 | /*colon CO*/ { CO, __, CO, __, __, __, __, -2, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 125 | /*value VA*/ { VA, VA, VA, -6, __, -5, __, __, __,-11, __, __, __, MI, __,-16,-10, __, __, __, __, __,-14, __,-14, __, __,-14, __, __, __, __}, 126 | /*array AR*/ { AR, __, AR, -6, __, -5, -7, __, __,-11, __, __, __, MI, __,-16,-10, __, __, __, __, __,-14, __,-14, __, __,-14, __, __, __, __}, 127 | /*string ST*/ { ST, __, __, ST, ST, ST, ST, ST, ST, -4, ES, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST}, 128 | /*escape ES*/ { __, __, __, __, __, __, __, __, __, ST, ST, ST, __, __, __, __, __, __, ST, __, __, __, ST, __, ST, ST, __, ST, U1, __, __, __}, 129 | /*u1 U1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, U2, U2, U2, U2, U2, U2, U2, U2, __, __, __, __, __, __, U2, U2, __}, 130 | /*u2 U2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, U3, U3, U3, U3, U3, U3, U3, U3, __, __, __, __, __, __, U3, U3, __}, 131 | /*u3 U3*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, U4, U4, U4, U4, U4, U4, U4, U4, __, __, __, __, __, __, U4, 
U4, __}, 132 | /*u4 U4*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, ST, ST, ST, ST, ST, ST, ST, ST, __, __, __, __, __, __, ST, ST, __}, 133 | /*minus MI*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-16, IN, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 134 | /*zero ZE*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, FR, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 135 | /*int IN*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, FR, IN, IN, __, __, __, __, E1, __, __, __, __, __, __, __, __, E1, __}, 136 | /*frac FR*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, __, FR, FR, __, __, __, __, E1, __, __, __, __, __, __, __, __, E1, __}, 137 | /*e E1*/ { __, __, __, __, __, __, __, __, __, __, __, __, E2, E2, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 138 | /*ex E2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 139 | /*exp E3*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 140 | /*tr T1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, T2, __, __, __, __, __, __}, 141 | /*tru T2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, T3, __, __, __}, 142 | /*true T3*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-15, __, __, __, __, __, __, __, __, __, __}, 143 | /*fa F1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, F2, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 144 | /*fal F2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, F3, __, __, __, __, __, __, __, __}, 145 | /*fals F3*/ { __, __, __, __, __, __, __, __, __, __, __, __, 
__, __, __, __, __, __, __, __, __, __, __, __, __, __, F4, __, __, __, __, __}, 146 | /*false F4*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-15, __, __, __, __, __, __, __, __, __, __}, 147 | /*nu N1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, N2, __, __, __}, 148 | /*nul N2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, N3, __, __, __, __, __, __, __, __}, 149 | /*null N3*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-15, __, __, __, __, __, __, __, __}, 150 | /*next NO*/ { __, NO, __, -6, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 151 | }; 152 | 153 | 154 | /* 155 | These modes can be pushed on the stack. 156 | */ 157 | enum modes { 158 | MODE_ARRAY, 159 | MODE_DONE, 160 | MODE_KEY, 161 | MODE_OBJECT, 162 | }; 163 | 164 | typedef struct JSON_checker_struct { 165 | int state; 166 | int depth; 167 | int top; 168 | int* stack; 169 | } * JSON_checker; 170 | 171 | 172 | extern JSON_checker new_JSON_checker(int depth); 173 | extern int JSON_checker_char(JSON_checker jc, int next_char); 174 | extern int JSON_checker_done(JSON_checker jc); 175 | 176 | #endif 177 | -------------------------------------------------------------------------------- /textadapter/core/loadtxt.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import operator 3 | import textadapter 4 | 5 | from numpy.compat import ( 6 | asstr, bytes, basestring, unicode 7 | ) 8 | 9 | 10 | def loadtxt(fname, dtype=float, comments='#', delimiter=None, 11 | converters=None, skiprows=0, usecols=None, unpack=False, 12 | ndmin=0): 13 | """ 14 | Load data from a text file. 15 | 16 | Each row in the text file must have the same number of values. 
17 | 18 | Parameters 19 | ---------- 20 | fname : file or str 21 | File, filename, or generator to read. If the filename extension is 22 | ``.gz`` or ``.bz2``, the file is first decompressed. Note that 23 | generators should return byte strings for Python 3k. 24 | dtype : data-type, optional 25 | Data-type of the resulting array; default: float. If this is a 26 | record data-type, the resulting array will be 1-dimensional, and 27 | each row will be interpreted as an element of the array. In this 28 | case, the number of columns used must match the number of fields in 29 | the data-type. 30 | comments : str, optional 31 | The character used to indicate the start of a comment; 32 | default: '#'. 33 | delimiter : str, optional 34 | The string used to separate values. By default, this is any 35 | whitespace. 36 | converters : dict, optional 37 | A dictionary mapping column number to a function that will convert 38 | that column to a float. E.g., if column 0 is a date string: 39 | ``converters = {0: datestr2num}``. Converters can also be used to 40 | provide a default value for missing data (but see also `genfromtxt`): 41 | ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None. 42 | skiprows : int, optional 43 | Skip the first `skiprows` lines; default: 0. 44 | usecols : sequence, optional 45 | Which columns to read, with 0 being the first. For example, 46 | ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. 47 | The default, None, results in all columns being read. 48 | unpack : bool, optional 49 | If True, the returned array is transposed, so that arguments may be 50 | unpacked using ``x, y, z = loadtxt(...)``. When used with a record 51 | data-type, arrays are returned for each field. Default is False. 52 | ndmin : int, optional 53 | The returned array will have at least `ndmin` dimensions. 54 | Otherwise mono-dimensional axes will be squeezed. 55 | Legal values: 0 (default), 1 or 2. 56 | .. 
versionadded:: 1.6.0 57 | 58 | Returns 59 | ------- 60 | out : ndarray 61 | Data read from the text file. 62 | 63 | See Also 64 | -------- 65 | load, fromstring, fromregex 66 | genfromtxt : Load data with missing values handled as specified. 67 | scipy.io.loadmat : reads MATLAB data files 68 | 69 | Notes 70 | ----- 71 | This function aims to be a fast reader for simply formatted files. The 72 | `genfromtxt` function provides more sophisticated handling of, e.g., 73 | lines with missing values. 74 | 75 | Examples 76 | -------- 77 | >>> from StringIO import StringIO # StringIO behaves like a file object 78 | >>> c = StringIO("0 1\\n2 3") 79 | >>> np.loadtxt(c) 80 | array([[ 0., 1.], 81 | [ 2., 3.]]) 82 | 83 | >>> d = StringIO("M 21 72\\nF 35 58") 84 | >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'), 85 | ... 'formats': ('S1', 'i4', 'f4')}) 86 | array([('M', 21, 72.0), ('F', 35, 58.0)], 87 | dtype=[('gender', '|S1'), ('age', '>> c = StringIO("1,0,2\\n3,0,4") 90 | >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True) 91 | >>> x 92 | array([ 1., 3.]) 93 | >>> y 94 | array([ 2., 4.]) 95 | 96 | """ 97 | 98 | user_converters = converters 99 | 100 | whitespace_delims = False 101 | if delimiter is None: 102 | whitespace_delims = True 103 | 104 | compression = None 105 | if isinstance(fname, basestring) and fname[-3:] == '.gz': 106 | compression = 'gzip' 107 | 108 | try: 109 | adapter = textadapter.text_adapter(fname, parser='csv', delimiter=delimiter, 110 | comment=comments, header=skiprows, compression=compression, whitespace_delims=whitespace_delims, 111 | field_names=False, infer_types=False) 112 | except EOFError: 113 | array = numpy.array([], dtype=numpy.int64, ndmin=ndmin) 114 | if ndmin == 2: 115 | array = array.T 116 | return array 117 | 118 | if usecols is None: 119 | usecols = [x for x in range(0, adapter.field_count)] 120 | elif isinstance(usecols, numpy.ndarray): 121 | usecols = usecols.tolist() 122 | else: 123 | usecols = 
list(usecols) 124 | 125 | # create valid dtype object 126 | if isinstance(dtype, (list, tuple)): 127 | dtype = [dt if isinstance(dt, tuple) else ('', dt) for dt in dtype] 128 | dtype = numpy.dtype(dtype) 129 | 130 | # create list of dtypes to send to TextAdapter 131 | if dtype.names is None: 132 | # create list of homogenous scalar dtypes from single scalar dtype 133 | numFields = len(usecols) 134 | dtypes = [dtype]*numFields 135 | fieldNames = None 136 | else: 137 | # create list of scalar dtypes from struct dtype 138 | dtypes, fieldNames = unpack_dtype(dtype) 139 | 140 | if fieldNames is not None: 141 | list_names = ['' for x in range(adapter.field_count)] 142 | for i, col in enumerate(usecols): 143 | list_names[col] = fieldNames[i] 144 | adapter.field_names = list_names 145 | 146 | adapter.set_field_types(types=dict(zip(usecols, dtypes))) 147 | 148 | if converters is not None: 149 | for field, converter in converters.items(): 150 | adapter.set_converter(field, converter) 151 | 152 | array = adapter[usecols][:] 153 | 154 | if dtype.fields is not None and numpy.object_ not in [dt[0] for dt in array.dtype.fields.values()]: 155 | array.dtype = dtype 156 | elif dtype.fields is None: 157 | array.dtype = dtype 158 | if dtype.names is None: 159 | if adapter.field_count == 0: 160 | array.shape = (adapter.size,) 161 | else: 162 | array.shape = (adapter.size, len(usecols)) 163 | 164 | # Multicolumn data are returned with shape (1, N, M), i.e. 165 | # (1, 1, M) for a single row - remove the singleton dimension there 166 | if array.ndim == 3 and array.shape[:2] == (1, 1): 167 | array.shape = (1, -1) 168 | 169 | # Verify that the array has at least dimensions `ndmin`. 
170 | # Check correctness of the values of `ndmin` 171 | if not ndmin in [0, 1, 2]: 172 | raise ValueError('Illegal value of ndmin keyword: %s' % ndmin) 173 | 174 | # Tweak the size and shape of the arrays - remove extraneous dimensions 175 | if array.ndim > ndmin: 176 | array = numpy.squeeze(array) 177 | 178 | # and ensure we have the minimum number of dimensions asked for 179 | # - has to be in this order for the odd case ndmin=1, array.squeeze().ndim=0 180 | if array.ndim < ndmin: 181 | if ndmin == 1: 182 | array = numpy.atleast_1d(array) 183 | elif ndmin == 2: 184 | array = numpy.atleast_2d(array).T 185 | 186 | if unpack: 187 | if len(dtype) > 1: 188 | # For structured arrays, return an array for each field. 189 | return [array[field] for field in dtype.names] 190 | else: 191 | return array.T 192 | else: 193 | return array 194 | 195 | 196 | def unpack_dtype(dtype): 197 | dtypes = [] 198 | names = [] 199 | for name in dtype.names: 200 | if dtype.fields[name][0].names is None: 201 | count = 1 202 | shape = dtype.fields[name][0].shape 203 | if len(shape) > 0: 204 | count = 1 205 | for s in shape: 206 | count = count * s 207 | if count == 0 or count == 1: 208 | dtypes.append(dtype.fields[name][0].base) 209 | names.append(name) 210 | else: 211 | for x in range(count): 212 | dtypes.append(dtype.fields[name][0].base) 213 | names.append('') 214 | else: 215 | nested_dtypes, nested_names = unpack_dtype(dtype.fields[name][0]) 216 | for dt in nested_dtypes: 217 | dtypes.append(dt) 218 | for n in nested_names: 219 | names.append(n) 220 | return dtypes, names 221 | 222 | 223 | -------------------------------------------------------------------------------- /textadapter/core/text_adapter.h: -------------------------------------------------------------------------------- 1 | #ifndef TEXTADAPTER_H 2 | #define TEXTADAPTER_H 3 | 4 | #ifdef _WIN32 5 | #define PCRE_STATIC 6 | #endif 7 | 8 | #include 9 | #include 10 | #include 11 | #include "converter_functions.h" 12 | #include 
"index.h" 13 | #include "field_info.h" 14 | #include "json_tokenizer.h" 15 | 16 | 17 | /* Buffer size for reading in compressed gzip data before uncompressing */ 18 | #define COMPRESSED_BUFFER_SIZE 1024*1024 19 | 20 | 21 | /* TextAdapter error codes */ 22 | typedef enum 23 | { 24 | ADAPTER_SUCCESS, 25 | ADAPTER_ERROR_SEEK, 26 | ADAPTER_ERROR_SEEK_EOF, 27 | ADAPTER_ERROR_SEEK_GZIP, 28 | ADAPTER_ERROR_SEEK_S3, 29 | ADAPTER_ERROR_READ, 30 | ADAPTER_ERROR_READ_EOF, 31 | ADAPTER_ERROR_READ_GZIP, 32 | ADAPTER_ERROR_READ_S3, 33 | ADAPTER_ERROR_NO_FIELDS, 34 | ADAPTER_ERROR_CONVERT, 35 | ADAPTER_ERROR_INDEX, 36 | ADAPTER_ERROR_PROCESS_TOKEN, 37 | ADAPTER_ERROR_READ_TOKENS, 38 | ADAPTER_ERROR_READ_RECORDS, 39 | ADAPTER_ERROR_JSON, 40 | ADAPTER_ERROR_INVALID_CHAR_CODE, 41 | ADAPTER_ERROR_LAST 42 | } AdapterError; 43 | 44 | 45 | typedef enum tokenizer_state 46 | { 47 | DEFAULT_STATE, 48 | RECORD_STATE, 49 | RECORD_END_STATE, 50 | COMMENT_STATE, 51 | QUOTE_STATE, 52 | QUOTE_END_STATE, 53 | PROCESS_STATE, 54 | ESCAPE_STATE 55 | } TokenizerState; 56 | 57 | 58 | typedef struct text_adapter_t TextAdapter; 59 | typedef struct input_data_t InputData; 60 | 61 | 62 | /* read function type for reading blocks of text from data source */ 63 | typedef AdapterError (*read_func_ptr)(InputData *input, 64 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 65 | 66 | /* seek function type for seeking to position in data source */ 67 | typedef AdapterError (*seek_func_ptr)(InputData *input, 68 | uint64_t offset); 69 | 70 | /* cleans up any handles or pointers involved in reading from data source */ 71 | typedef void (*close_func_ptr)(InputData *input); 72 | 73 | /* tokenize function for parsing text buffer and finding fields appropriate 74 | converter function should be called for each field that is found */ 75 | typedef AdapterError (*tokenize_func_ptr)(TextAdapter *adapter, 76 | uint64_t num_tokens, uint64_t step, char **output, 77 | uint64_t *num_tokens_found, int enable_index, uint64_t 
index_density); 78 | 79 | 80 | struct input_data_t 81 | { 82 | void *input; 83 | 84 | /* retrieves data chunks from data source and stores in buffer */ 85 | read_func_ptr read; 86 | 87 | /* seeks to new position in data source */ 88 | seek_func_ptr seek; 89 | 90 | /* cleans up any handles or pointers involved in reading from data source */ 91 | close_func_ptr close; 92 | 93 | void *compressed_input; 94 | 95 | char *compressed_prebuffer; 96 | 97 | /* retrieves and decompresses data chunks from compressed data source 98 | and stores in buffer */ 99 | read_func_ptr read_compressed; 100 | 101 | /* seeks to new position in compressed data source */ 102 | seek_func_ptr seek_compressed; 103 | 104 | /* Retrieve gzip access point from index */ 105 | get_gzip_access_point_func_ptr get_gzip_access_point; 106 | 107 | /* number of bytes to skip at beginning of data stream */ 108 | uint64_t header; 109 | 110 | /* number of bytes to skip at end of data stream */ 111 | uint64_t footer; 112 | 113 | /* Record where reading is started from after seek */ 114 | uint64_t start_record; 115 | 116 | /* Data offset where reading is started from after seek */ 117 | uint64_t start_offset; 118 | 119 | /* index of record offsets */ 120 | void *index; 121 | 122 | }; 123 | 124 | 125 | typedef struct memmap_input_t 126 | { 127 | char *data; 128 | uint64_t size; 129 | uint64_t position; 130 | } MemMapInput; 131 | 132 | 133 | typedef struct gzip_input_t 134 | { 135 | /* data struct for reading gzipped compressed data */ 136 | z_stream *z; 137 | 138 | uint32_t compressed_bytes_processed; 139 | uint64_t uncompressed_bytes_processed; 140 | int buffer_refreshed; 141 | 142 | /* data struct for reading uncompressed data */ 143 | void *uncompressed_input; 144 | } GzipInput; 145 | 146 | 147 | typedef struct json_tokenizer_args_t 148 | { 149 | struct JSON_checker_struct *jc; 150 | } JsonTokenizerArgs; 151 | 152 | typedef struct regex_tokenizer_args_t 153 | { 154 | pcre *pcre_regex; 155 | struct pcre_extra
*extra_regex; 156 | } RegexTokenizerArgs; 157 | 158 | 159 | typedef struct text_adapter_buffer_t 160 | { 161 | char *data; 162 | uint64_t size; 163 | uint64_t bytes_processed; 164 | int eof; 165 | } TextAdapterBuffer; 166 | 167 | 168 | typedef struct convert_error_info_t 169 | { 170 | ConvertError convert_result; 171 | char *token; 172 | uint64_t record_num; 173 | uint64_t field_num; 174 | } ConvertErrorInfo; 175 | 176 | 177 | typedef struct text_adapter_t 178 | { 179 | uint64_t num_records; 180 | 181 | char delim_char; 182 | char comment_char; 183 | char quote_char; 184 | char escape_char; 185 | 186 | /* Setting this to true will treat a series of whitespace 187 | as a single delimiter. Otherwise, each whitespace char 188 | will delimit a single field. */ 189 | int group_whitespace_delims; 190 | int any_whitespace_as_delim; 191 | 192 | int infer_types_mode; 193 | 194 | /* If 0, empty lines will be treated as missing fields. Defaults to 1. */ 195 | int skipblanklines; 196 | 197 | InputData *input_data; 198 | 199 | /* array of field info for each field */ 200 | FieldList *fields; 201 | 202 | /* buffer for storing chunks of data from data source to be parsed */ 203 | TextAdapterBuffer buffer; 204 | 205 | /* parses tokens in buffer */ 206 | tokenize_func_ptr tokenize; 207 | void *tokenize_args; 208 | 209 | /* index of record offsets */ 210 | void *index; 211 | 212 | /* Density of record offsets index. Density value x means every 213 | x-th record is indexed.
*/ 214 | uint64_t index_density; 215 | 216 | /* function for building additional index info for specific 217 | data stream type */ 218 | indexer_func_ptr indexer; 219 | index_lookup_func_ptr index_lookup; 220 | add_gzip_access_point_func_ptr add_gzip_access_point; 221 | 222 | int reset_json_args; 223 | 224 | } TextAdapter; 225 | 226 | 227 | /* Allocate new TextAdapter struct and set functions */ 228 | TextAdapter* open_text_adapter(InputData *input_data); 229 | 230 | /* Deallocate TextAdapter struct */ 231 | void close_text_adapter(TextAdapter *adapter); 232 | 233 | /* Seek to specific record in data source */ 234 | AdapterError seek_record(TextAdapter *adapter, uint64_t rec_num); 235 | 236 | /* Read specified number of records from data source, starting at current 237 | position. Fields in records will be converted to data type and stored in 238 | output buffer. Output buffer should be big enough to store 239 | requested records. */ 240 | AdapterError read_records(TextAdapter *adapter, uint64_t num_records, 241 | uint64_t step, char *output, uint64_t *num_records_found); 242 | 243 | /* default build index function */ 244 | AdapterError build_index(TextAdapter *adapter); 245 | 246 | /* initialize default index info */ 247 | void clear_gzip_index(TextAdapter *adapter); 248 | 249 | /* build index function for gzip files */ 250 | AdapterError build_gzip_index(TextAdapter *adapter); 251 | 252 | /* default tokenize function based on delimiter */ 253 | AdapterError delim_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 254 | uint64_t step, char **output, uint64_t *num_tokens_found, 255 | int enable_index, uint64_t index_density); 256 | 257 | AdapterError json_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 258 | uint64_t step, char **output, uint64_t *num_tokens_found, 259 | int enable_index, uint64_t index_density); 260 | 261 | AdapterError json_record_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 262 | uint64_t step, char **output, uint64_t *num_tokens_found,
263 | int enable_index, uint64_t index_density); 264 | 265 | /* regular expression tokenize function */ 266 | AdapterError regex_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 267 | uint64_t step, char **output, uint64_t *num_tokens_found, 268 | int enable_index, uint64_t index_density); 269 | 270 | /* tokenize function based on predefined field widths */ 271 | AdapterError fixed_width_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 272 | uint64_t step, char **output, uint64_t *num_tokens_found, 273 | int enable_index, uint64_t index_density); 274 | 275 | AdapterError record_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 276 | uint64_t step, char **output, uint64_t *num_tokens_found, 277 | int enable_index, uint64_t index_density); 278 | 279 | AdapterError line_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 280 | uint64_t step, char **output, uint64_t *num_tokens_found, 281 | int enable_index, uint64_t index_density); 282 | 283 | ConvertErrorInfo get_error_info(void); 284 | 285 | 286 | #endif 287 | -------------------------------------------------------------------------------- /textadapter/examples/README: -------------------------------------------------------------------------------- 1 | To run examples, first generate example data using: 2 | 3 | cd ../tests 4 | python generate.py 500 # number of records 5 | -------------------------------------------------------------------------------- /textadapter/examples/basic.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/ints', delimiter=',', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u8', 2:'f4', 3:'f8', 4:'S10'}) 7 | 8 | # Read all records 9 | print(adapter[:]) 10 | 11 | # Read first ten records 12 | print(adapter[0:10]) 13 | 14 | # Change dtype; retrieve only 1st and 5th field 15 | adapter.set_field_types({0:'u4', 
4:'u4'}) 16 | 17 | # Read every other record 18 | print(adapter[::2]) 19 | 20 | -------------------------------------------------------------------------------- /textadapter/examples/converter.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/ints', delimiter=',', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u8', 2:'f4', 3:'f8', 4:'S10'}) 7 | 8 | # Override default converter for first field 9 | adapter.set_converter(0, lambda x: int(x)*2) 10 | 11 | # Read first 10 records 12 | print(adapter[:10]) 13 | 14 | -------------------------------------------------------------------------------- /textadapter/examples/fixed_width.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.FixedWidthTextAdapter('../tests/data/fixedwidths', [2,3,4,5,6]) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types(dict(zip(range(5), ['u4']*5))) 7 | 8 | # Read all records 9 | print(adapter[:]) 10 | 11 | 12 | -------------------------------------------------------------------------------- /textadapter/examples/gzip_ints.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/ints.gz', delimiter=',', compression='gzip', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u8', 2:'f4', 3:'f8', 4:'S10'}) 7 | 8 | print('\n\n!!! INVESTIGATE !!!\n\n') 9 | 10 | # adapter.size is unknown at this point... 
11 | print('Before we read any records, try adapter.size...') 12 | try: 13 | sz = adapter.size 14 | except AttributeError as err: 15 | print('AttributeError:', err) 16 | 17 | # Read first record 18 | print('Read first record\n', adapter[0]) 19 | 20 | # But now adapter.size IS known! 21 | print('After we read a record...') 22 | print('adapter.size', adapter.size) 23 | 24 | # Read last record 25 | print('\n\nNow we attempt to read the LAST record...') 26 | print('adapter[-1] == adapter[0]?!? == ', adapter[-1]) 27 | print('adapter[1], as should be == ', adapter[1]) 28 | 29 | print('After we read ALL records...') 30 | records = adapter[:] 31 | print('adapter[-1] == ', adapter[-1]) 32 | 33 | print('\n\nFollowing code seems outdated. Remove it?') 34 | try: 35 | # build index of records and save index to file 36 | indexArray, gzipIndexArray = adapter.create_index() 37 | # load index from file 38 | adapter.set_index(indexArray, gzipIndexArray) 39 | except TypeError as err: 40 | raise TypeError(err) 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /textadapter/examples/missing_values.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/missingvalues', delimiter=',', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u4', 2:'u4', 3:'u4', 4:'u4'}) 7 | 8 | # Define list of strings for each field that represent missing values 9 | adapter.set_missing_values({0:['NA', 'NaN'], 4:['xx','inf']}) 10 | 11 | # Set fill value for missing values in each field 12 | adapter.set_fill_values({0:99, 4:999}) 13 | 14 | # Read all records 15 | print(adapter[:]) 16 | 17 | 18 | -------------------------------------------------------------------------------- /textadapter/examples/regex.py: -------------------------------------------------------------------------------- 1 | import 
textadapter 2 | 3 | adapter = textadapter.RegexTextAdapter('../tests/data/ints', '([0-9]*),([0-9]*),([0-9]*),([0-9]*),([0-9]*)') 4 | 5 | # Set dtype for each group in regular expression. 6 | # Any groups without a dtype defined for it will not be 7 | # stored in numpy array 8 | adapter.set_field_types(dict(zip(range(5), ['u4']*5))) 9 | 10 | # Read all records 11 | print(adapter[:]) 12 | 13 | -------------------------------------------------------------------------------- /textadapter/lib/Converters.pyx: -------------------------------------------------------------------------------- 1 | import numpy 2 | cimport numpy 3 | 4 | 5 | cdef ConvertError str2str_object_converter(void *input_str, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 6 | """ 7 | Wrapper function for calling string object converter function 8 | from low level C api. This is used to convert c strings to python 9 | string objects. 10 | 11 | Arguments: 12 | void *input - pointer to value to convert 13 | uint32_t input_len - length in bytes of input value 14 | void *output - pointer to memory chunk to store converted value 15 | uint32_t output_len - size of output memory chunk 16 | void *arg - pointer to python callable object which does the actual converting 17 | 18 | Returns: 19 | converted value as a python string object 20 | """ 21 | cdef ConvertError result = CONVERT_ERROR_OBJECT_CONVERTER 22 | cdef PyObject **object_ptr 23 | object_ptr = output 24 | cdef kh_string_t *kh_string_table = arg 25 | cdef int ret 26 | cdef khiter_t it 27 | cdef object temp 28 | cdef char *input_str_copy 29 | 30 | try: 31 | # Convert c string to Python string object and store in output array 32 | if object_ptr != NULL: 33 | 34 | # string object hash table exists 35 | if kh_string_table != NULL: 36 | 37 | # Look for existing string object 38 | it = kh_get_string(kh_string_table, input_str) 39 | 40 | # String object doesn't exist, so create and store in output 41 | # array and hash table 
42 | if it == kh_string_table.n_buckets: 43 | temp = (input_str)[0:input_len].decode(config['encoding']) 44 | object_ptr[0] = temp 45 | Py_INCREF(object_ptr[0]) 46 | input_str_copy = malloc(input_len+1) 47 | strncpy(input_str_copy, input_str, input_len+1) 48 | it = kh_put_string(kh_string_table, input_str_copy, &ret) 49 | kh_string_table.vals[it] = object_ptr[0] 50 | 51 | # String object exists, so store existing object in array 52 | else: 53 | object_ptr[0] = kh_string_table.vals[it] 54 | Py_INCREF(object_ptr[0]) 55 | 56 | # No string object hash table exists; just convert and store 57 | else: 58 | temp = (input_str)[0:input_len].decode(config['encoding']) 59 | object_ptr[0] = temp 60 | Py_INCREF(object_ptr[0]) 61 | 62 | # Try converting c string to Python string object (for type inference) 63 | else: 64 | temp = (input_str)[0:input_len].decode(config['encoding']) 65 | 66 | result = CONVERT_SUCCESS 67 | 68 | except Exception as e: 69 | result = CONVERT_ERROR_OBJECT_CONVERTER 70 | 71 | return result 72 | 73 | 74 | cdef ConvertError str2datetime_object_converter(void *input_str, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 75 | """ 76 | Wrapper function for calling string object converter function 77 | from low level C api. This is used to convert c strings to python 78 | string objects. 
79 | 80 | Arguments: 81 | void *input - pointer to value to convert 82 | uint32_t input_len - length in bytes of input value 83 | void *output - pointer to memory chunk to store converted value 84 | uint32_t output_len - size of output memory chunk 85 | void *arg - pointer to python callable object which does the actual converting 86 | 87 | Returns: 88 | converted value as a python string object 89 | """ 90 | cdef ConvertError result = CONVERT_ERROR_OBJECT_CONVERTER 91 | cdef PyObject **object_ptr 92 | object_ptr = output 93 | cdef object temp 94 | 95 | try: 96 | if object_ptr != NULL: 97 | temp = str((input_str)[0:input_len].encode()) 98 | object_ptr[0] = temp 99 | Py_INCREF(object_ptr[0]) 100 | 101 | result = CONVERT_SUCCESS 102 | except Exception as e: 103 | result = CONVERT_ERROR_OBJECT_CONVERTER 104 | 105 | return result 106 | 107 | 108 | cdef ConvertError str2datetime_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 109 | """ 110 | Wrapper function for calling numpy datetime converter function 111 | from low level C api. 
112 | 113 | Arguments: 114 | void *input - pointer to value to convert 115 | uint32_t input_len - length in bytes of input value 116 | void *output - pointer to memory chunk to store converted value 117 | uint32_t output_len - size of output memory chunk 118 | void *arg - pointer to python callable object which does the actual converting 119 | 120 | Returns: 121 | Convert result 122 | """ 123 | cdef ConvertError result = CONVERT_ERROR_OBJECT_CONVERTER 124 | cdef numpy.npy_intp dims[1] 125 | cdef char *temp = input 126 | 127 | if arg == NULL: 128 | return CONVERT_ERROR_OBJECT_CONVERTER 129 | 130 | try: 131 | dtype = arg 132 | value = dtype.type(temp) 133 | if output != NULL: 134 | dims[0] = 1 135 | array = numpy.PyArray_SimpleNewFromData(1, dims, value.dtype.num, output) 136 | array.dtype = numpy.dtype(dtype) 137 | array[0] = value 138 | result = CONVERT_SUCCESS 139 | except Exception as e: 140 | result = CONVERT_ERROR_OBJECT_CONVERTER 141 | 142 | return result 143 | 144 | 145 | cdef ConvertError python_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 146 | """ 147 | Wrapper function for calling python converter function from low level C api. 148 | 149 | Arguments: 150 | void *input - pointer to value to convert 151 | uint32_t input_len - length in bytes of input value 152 | void *output - pointer to memory chunk to store converted value 153 | uint32_t output_len - size of output memory chunk 154 | void *arg - pointer to python callable object which does the actual converting 155 | 156 | Returns: 157 | Convert result 158 | """ 159 | cdef numpy.npy_intp dims[1] 160 | cdef char *temp = calloc(1, input_len+1) 161 | cdef bytes py_string 162 | cdef ConvertError result = CONVERT_ERROR_USER_CONVERTER 163 | # "input" contains a long string (char*). 
We only copy "input_len" and make 164 | # sure that there is a null byte at the end (by using calloc with 165 | # input_len+1 above) 166 | memcpy(temp, input, input_len) 167 | 168 | try: 169 | # Convert "temp" to a Python string (bytes in fact) 170 | py_string = temp 171 | # Convert "arg" to Python callable: 172 | func = arg 173 | # call python callable object to convert input value 174 | new_value = func(py_string) 175 | 176 | if isinstance(new_value, numpy.generic): 177 | data = bytes(new_value.data) 178 | if output != NULL: 179 | memcpy(output, data, output_len) 180 | result = CONVERT_SUCCESS 181 | # JNB: not sure if there is a better way to store objects in numpy object array 182 | elif isinstance(new_value, object): 183 | if output != NULL: 184 | dims[0] = 1 185 | array = numpy.PyArray_SimpleNewFromData(1, dims, numpy.NPY_OBJECT, output) 186 | array[0] = new_value 187 | result = CONVERT_SUCCESS 188 | else: 189 | result = CONVERT_ERROR_USER_CONVERTER 190 | 191 | except: 192 | result = CONVERT_ERROR_USER_CONVERTER 193 | finally: 194 | free(temp) 195 | 196 | return result 197 | 198 | 199 | ctypedef uint64_t (*uint_numba_func_ptr)(char *) 200 | ctypedef int64_t (*int_numba_func_ptr)(char *) 201 | ctypedef double (*float_numba_func_ptr)(char *) 202 | ctypedef PyObject* (*object_numba_func_ptr)(char *) 203 | ctypedef int64_t (*datetime_numba_func_ptr)(char *) 204 | 205 | cdef ConvertError str2uint_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 206 | cdef uint_numba_func_ptr numba_func = arg 207 | cdef uint64_t *output_ptr64 = output 208 | cdef uint32_t *output_ptr32 = output 209 | cdef uint16_t *output_ptr16 = output 210 | cdef uint8_t *output_ptr8 = output 211 | cdef uint64_t value 212 | 213 | try: 214 | if output_len == 8: 215 | value = numba_func(input) 216 | if output != NULL: 217 | output_ptr64[0] = value 218 | elif output_len == 4: 219 | value = numba_func(input) 220 | if value > 0xffffffff: 221 
| return CONVERT_ERROR_NUMBA 222 | if output != NULL: 223 | output_ptr32[0] = value 224 | elif output_len == 2: 225 | value = numba_func(input) 226 | if value > 0xffff: 227 | return CONVERT_ERROR_NUMBA 228 | if output != NULL: 229 | output_ptr16[0] = value 230 | elif output_len == 1: 231 | value = numba_func(input) 232 | if value > 0xff: 233 | return CONVERT_ERROR_NUMBA 234 | if output != NULL: 235 | output_ptr8[0] = value 236 | else: 237 | return CONVERT_ERROR_NUMBA 238 | except: 239 | return CONVERT_ERROR_NUMBA 240 | return CONVERT_SUCCESS 241 | 242 | cdef ConvertError str2float_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 243 | cdef float_numba_func_ptr numba_func = arg 244 | cdef float *output_ptr32 = output 245 | cdef double *output_ptr64 = output 246 | cdef double value 247 | 248 | try: 249 | if output_len == 4: 250 | value = numba_func(input) 251 | if output != NULL: 252 | output_ptr32[0] = value 253 | elif output_len == 8: 254 | value = numba_func(input) 255 | if output != NULL: 256 | output_ptr64[0] = value 257 | else: 258 | return CONVERT_ERROR_NUMBA 259 | except: 260 | return CONVERT_ERROR_NUMBA 261 | return CONVERT_SUCCESS 262 | 263 | cdef ConvertError str2datetime_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 264 | cdef datetime_numba_func_ptr numba_func = arg 265 | cdef int64_t *output_ptr = output 266 | cdef int64_t value 267 | 268 | try: 269 | if output_len == 8: 270 | value = numba_func(input) 271 | if output != NULL: 272 | output_ptr[0] = value 273 | else: 274 | return CONVERT_ERROR_NUMBA 275 | except: 276 | return CONVERT_ERROR_NUMBA 277 | return CONVERT_SUCCESS 278 | 279 | cdef ConvertError str2object_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 280 | cdef object_numba_func_ptr numba_func = arg 281 | cdef PyObject **output_ptr = output 282 | cdef 
object value 283 | 284 | try: 285 | value = numba_func(input) 286 | if output != NULL: 287 | output_ptr[0] = value 288 | except: 289 | return CONVERT_ERROR_NUMBA 290 | 291 | return CONVERT_SUCCESS 292 | 293 | 294 | 295 | -------------------------------------------------------------------------------- /textadapter/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContinuumIO/TextAdapter/53138c2277cdfcf32e127251313d4f77f81050aa/textadapter/lib/__init__.py -------------------------------------------------------------------------------- /textadapter/lib/_stdint.h: -------------------------------------------------------------------------------- 1 | #ifndef STDINT_H 2 | #define STDINT_H 3 | 4 | 5 | #if defined(_MSC_VER) && _MSC_VER < 1600 6 | /* Visual Studio before 2010 didn't have stdint.h */ 7 | #include 8 | typedef signed char int8_t; 9 | typedef short int16_t; 10 | typedef int int32_t; 11 | typedef __int64 int64_t; 12 | typedef unsigned char uint8_t; 13 | typedef unsigned short uint16_t; 14 | typedef unsigned int uint32_t; 15 | typedef unsigned __int64 uint64_t; 16 | #define INT8_MIN SCHAR_MIN 17 | #define INT8_MAX SCHAR_MAX 18 | #define INT16_MIN SHRT_MIN 19 | #define INT16_MAX SHRT_MAX 20 | #define INT32_MIN INT_MIN 21 | #define INT32_MAX INT_MAX 22 | #define UINT8_MAX UCHAR_MAX 23 | #define UINT16_MAX USHRT_MAX 24 | #define UINT32_MAX UINT_MAX 25 | #define UINT64_MAX _UI64_MAX 26 | #else 27 | #include 28 | #endif 29 | 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /textadapter/lib/converter_functions.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERTERS_H 2 | #define CONVERTERS_H 3 | 4 | #if defined(_MSC_VER) && _MSC_VER < 1600 5 | /* Visual Studio before 2010 didn't have stdint.h */ 6 | typedef signed char int8_t; 7 | typedef short int16_t; 8 | typedef int int32_t; 9 | 
typedef __int64 int64_t; 10 | typedef unsigned char uint8_t; 11 | typedef unsigned short uint16_t; 12 | typedef unsigned int uint32_t; 13 | typedef unsigned __int64 uint64_t; 14 | #define INT8_MIN SCHAR_MIN 15 | #define INT8_MAX SCHAR_MAX 16 | #define INT16_MIN SHRT_MIN 17 | #define INT16_MAX SHRT_MAX 18 | #define INT32_MIN INT_MIN 19 | #define INT32_MAX INT_MAX 20 | #define UINT8_MAX UCHAR_MAX 21 | #define UINT16_MAX USHRT_MAX 22 | #define UINT32_MAX UINT_MAX 23 | #else 24 | #include 25 | #endif 26 | 27 | #include 28 | 29 | 30 | typedef enum 31 | { 32 | CONVERT_SUCCESS, 33 | CONVERT_SUCCESS_TYPE_CHANGED, 34 | CONVERT_ERROR, 35 | CONVERT_ERROR_OVERFLOW, 36 | CONVERT_ERROR_TRUNCATE, 37 | CONVERT_ERROR_INPUT_TYPE, 38 | CONVERT_ERROR_INPUT_SIZE, 39 | CONVERT_ERROR_OUTPUT_SIZE, 40 | CONVERT_ERROR_INPUT_STRING, 41 | CONVERT_ERROR_USER_CONVERTER, 42 | CONVERT_ERROR_OBJECT_CONVERTER, 43 | CONVERT_ERROR_NUMBA, 44 | CONVERT_ERROR_LAST 45 | } ConvertError; 46 | 47 | 48 | typedef enum 49 | { 50 | UINT_CONVERTER_FUNC, 51 | INT_CONVERTER_FUNC, 52 | FLOAT_CONVERTER_FUNC, 53 | STRING_CONVERTER_FUNC, 54 | STRING_OBJECT_CONVERTER_FUNC, 55 | NUM_CONVERTER_FUNCS 56 | } DefaultConverterFuncs; 57 | 58 | 59 | /* 60 | * converter function signature for functions that convert strings to a specific 61 | * data type and stores in output buffer 62 | * Inputs: 63 | * input: null terminated C string representing value to convert 64 | * input_len: length of input (redundant but input string originally was not 65 | * null terminated 66 | * input_type: indicates type of input (not used by every converter func) 67 | * output: pointer to memory block where output value should be stored 68 | * output_len: length of output reserved for output value 69 | * arg: optional arg value/struct specific to each converter func 70 | * Output: 71 | * error code defined above in ConvertError enum 72 | */ 73 | typedef ConvertError (*converter_func_ptr)(const char *input, 74 | uint32_t input_len, 75 | int 
input_type, 76 | void *output, 77 | uint32_t output_len, 78 | void *arg); 79 | 80 | /* 81 | * The following conversion functions follow conversion function signature 82 | * defined above 83 | */ 84 | 85 | /* Convert null terminated C string to signed int */ 86 | ConvertError str2int_converter(const char *input, uint32_t input_len, 87 | int input_type, void *output, uint32_t output_len, void *arg); 88 | /* Convert null terminated C string to unsigned int */ 89 | ConvertError str2uint_converter(const char *input, uint32_t input_len, 90 | int input_type, void *output, uint32_t output_len, void *arg); 91 | /* Convert null terminated C string to float/double */ 92 | ConvertError str2float_converter(const char *input, uint32_t input_len, 93 | int input_type, void *output, uint32_t output_len, void *arg); 94 | /* Copy null terminated C string to output of possibly different length */ 95 | ConvertError str2str_converter(void *input, uint32_t input_len, 96 | int input_type, void *output, uint32_t output_len, void *arg); 97 | /* Convert null terminated C string to complex number */ 98 | ConvertError str2complex_converter(void *input, uint32_t input_len, 99 | int input_type, void *output, uint32_t output_len, void *arg); 100 | 101 | 102 | /* 103 | * Extract signed int of various sizes from memory block and cast to 104 | * signed int64 if needed. Input integer size is specified by input_len argument. 105 | */ 106 | ConvertError get_int_value(void *input, uint32_t input_len, int64_t *value); 107 | 108 | /* 109 | * Extract unsigned int of various sizes from memory block and cast to 110 | * unsigned int64 if needed. Input integer size is specified by input_len argument. 111 | */ 112 | ConvertError get_uint_value(void *input, uint32_t input_len, uint64_t *value); 113 | 114 | /* 115 | * Extract double/float from from memory block and cast to 116 | * double if needed. Input floating point size is specified by input_len argument. 
117 | */ 118 | ConvertError get_float_value(void *input, uint32_t input_len, double *value); 119 | 120 | /* 121 | * Save signed int64 value to memory block, casting to appropriate output integer 122 | * size if needed. Output integer size is specified by output_len arg. 123 | */ 124 | ConvertError put_int_value(void *output, uint32_t output_len, int64_t value); 125 | 126 | /* 127 | * Save unsigned int64 value to memory block, casting to appropriate output integer 128 | * size if needed. Output integer size is specified by output_len arg. 129 | */ 130 | ConvertError put_uint_value(void *output, uint32_t output_len, uint64_t value); 131 | 132 | /* 133 | * Save double/float value to memory block, casting to appropriate output floating 134 | * point size if needed. Output float size is specified by output_len arg. 135 | */ 136 | ConvertError put_float_value(void *output, uint32_t output_len, double value); 137 | 138 | #endif 139 | -------------------------------------------------------------------------------- /textadapter/lib/errors.py: -------------------------------------------------------------------------------- 1 | class AdapterException(Exception): 2 | """Generic adapter exception for reporting reading, parsing, and 3 | converting issues. All adapter exceptions have following instance 4 | variables in common: 5 | 6 | * `record` - record reference where error occured 7 | * `field` - field reference where error occured 8 | """ 9 | def __init__(self, message=None): 10 | super(AdapterException, self).__init__(message) 11 | 12 | self.record = None 13 | self.field = None 14 | 15 | class SourceError(AdapterException): 16 | """Raised on error while reading or talking to a data source. It might be 17 | seek or read error for file sources or broken connection for database 18 | sources.""" 19 | pass 20 | 21 | class SourceNotFoundError(SourceError): 22 | """Raised when data source (file, table, ...) 
was not found.""" 23 | def __init__(self, message=None, source=None): 24 | super(SourceNotFoundError, self).__init__(message) 25 | self.source = source 26 | 27 | class ConfigurationError(AdapterException): 28 | """Raised when objects are mis-configured.""" 29 | pass 30 | 31 | class NoSuchFieldError(AdapterException): 32 | """Raised when non-existent field is referenced, either by name or position index.""" 33 | pass 34 | 35 | class DataIndexError(AdapterException): 36 | """Raised for example when a record is not found in record index in indexed 37 | data source.""" 38 | pass 39 | 40 | class DataTypeError(AdapterException): 41 | """Raised on data type mis-match or when type conversion fails.""" 42 | pass 43 | 44 | class ParserError(AdapterException): 45 | """Raised when there is problem with parsing source data, for example in 46 | broken text file with CSV. The `token` instance variable contains problematic 47 | token that was not parsed correctly.""" 48 | def __init__(self, message=None, token=None): 49 | super(ParserError, self).__init__(message) 50 | self.token = token 51 | 52 | class ArgumentError(AdapterException): 53 | """Invalid arguments used in calling textadapter functions/methods""" 54 | pass 55 | 56 | class InternalInconsistencyError(AdapterException): 57 | """Raised when the library goes into a state that is not expected to 58 | happen.""" 59 | pass 60 | 61 | class AdapterIndexError(AdapterException): 62 | """ Raised when record number or slice is invalid """ 63 | pass 64 | 65 | -------------------------------------------------------------------------------- /textadapter/lib/field_info.c: -------------------------------------------------------------------------------- 1 | #include "field_info.h" 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | /* Set the number of fields in input data. 
This */ 8 | void set_num_fields(FieldList *fields, uint32_t num_fields) 9 | { 10 | uint32_t i; 11 | 12 | #ifdef DEBUG_ADAPTER 13 | printf("set_num_fields() setting number of fields to %u\n", num_fields); 14 | #endif 15 | 16 | if (fields == NULL) 17 | return; 18 | 19 | if (fields->field_info != NULL) 20 | { 21 | clear_fields(fields); 22 | } 23 | 24 | if (num_fields > 0) 25 | fields->field_info = (FieldInfo*)calloc(num_fields, sizeof(FieldInfo)); 26 | 27 | fields->num_fields = num_fields; 28 | 29 | for (i = 0; i < num_fields; i++) 30 | { 31 | fields->field_info[i].infer_type = 1; 32 | } 33 | } 34 | 35 | /* Initialize infer_type flag in each field to 1 */ 36 | void init_infer_types(FieldList *fields) 37 | { 38 | uint32_t i; 39 | for(i = 0; i < fields->num_fields; i++) 40 | { 41 | fields->field_info[i].infer_type = 1; 42 | } 43 | } 44 | 45 | /* Initialize missing value struct */ 46 | void init_missing_values(FieldList *fields, char *field_name, 47 | uint32_t field_num, uint32_t num_missing_values) 48 | { 49 | MissingValues *missing_values; 50 | 51 | if (fields == NULL) 52 | return; 53 | 54 | if (field_num >= fields->num_fields) 55 | return; 56 | 57 | missing_values = &fields->field_info[field_num].missing_values; 58 | 59 | clear_missing_values(missing_values); 60 | 61 | missing_values->num_missing_values = num_missing_values; 62 | missing_values->missing_value_lens = 63 | calloc(num_missing_values, sizeof(uint32_t)); 64 | missing_values->missing_values = 65 | calloc(num_missing_values, sizeof(char *)); 66 | } 67 | 68 | 69 | /* Add missing value string for the specified field */ 70 | void add_missing_value(FieldList *fields, char *field_name, 71 | uint32_t field_num, char *missing_value, uint32_t missing_value_len) 72 | { 73 | MissingValues *missing_values; 74 | uint32_t i; 75 | 76 | if (fields == NULL) 77 | return; 78 | 79 | if (field_num >= fields->num_fields) 80 | return; 81 | 82 | missing_values = &fields->field_info[field_num].missing_values; 83 | 84 | /* Find 
first empty entry in missing values array to store missing 85 | value string */ 86 | i = 0; 87 | while (i < missing_values->num_missing_values && 88 | missing_values->missing_values[i] > 0) 89 | { 90 | i++; 91 | } 92 | 93 | missing_values->missing_values[i] = 94 | calloc(missing_value_len + 1, sizeof(char)); 95 | strncpy(missing_values->missing_values[i], missing_value, missing_value_len); 96 | missing_values->missing_value_lens[i] = missing_value_len; 97 | } 98 | 99 | 100 | /* Set pointer to fill value for specified field. Positive valeu for 101 | 'loose' argument enables fill value to be used when token for this 102 | field cannot be converted. */ 103 | void set_fill_value(FieldList *fields, char *field_name, 104 | uint32_t field_num, void *new_fill_value, uint32_t fill_value_len, int loose) 105 | { 106 | FillValue *fill_value; 107 | 108 | if (fields == NULL) 109 | return; 110 | 111 | if (field_num >= fields->num_fields) 112 | return; 113 | 114 | fill_value = &fields->field_info[field_num].fill_value; 115 | 116 | if (new_fill_value == NULL) 117 | { 118 | clear_fill_value(fill_value); 119 | } 120 | else 121 | { 122 | fill_value->fill_value = calloc(1, fill_value_len); 123 | memcpy(fill_value->fill_value, new_fill_value, fill_value_len); 124 | fill_value->loose = loose; 125 | } 126 | } 127 | 128 | 129 | uint32_t get_field_size(FieldList *fields, char *field_name, uint32_t field_num) 130 | { 131 | uint32_t i; 132 | 133 | if (fields == NULL) 134 | return 0; 135 | 136 | if (field_name != NULL) 137 | { 138 | i = 0; 139 | while (i < fields->num_fields) 140 | { 141 | if (strcpy(fields->field_info[i].name, field_name)) 142 | { 143 | return fields->field_info[i].output_field_size; 144 | } 145 | i++; 146 | } 147 | 148 | return 0; 149 | } 150 | else 151 | { 152 | return fields->field_info[field_num].output_field_size; 153 | } 154 | } 155 | 156 | 157 | uint32_t get_output_record_size(FieldList *fields) 158 | { 159 | uint32_t i; 160 | uint32_t rec_size; 161 | 162 | if (fields 
== NULL) 163 | return 0; 164 | 165 | rec_size = 0; 166 | 167 | for (i = 0; i < fields->num_fields; i++) 168 | { 169 | if (fields->field_info[i].converter != NULL) 170 | { 171 | rec_size += fields->field_info[i].output_field_size; 172 | } 173 | } 174 | 175 | return rec_size; 176 | } 177 | 178 | 179 | 180 | /* Deallocate missing value strings */ 181 | void clear_missing_values(MissingValues *missing_values) 182 | { 183 | uint32_t i; 184 | 185 | assert(missing_values != NULL); 186 | 187 | if (missing_values->missing_values != NULL) 188 | { 189 | for (i = 0; i < missing_values->num_missing_values; i++) 190 | { 191 | if (missing_values->missing_values[i] != NULL) 192 | free(missing_values->missing_values[i]); 193 | } 194 | 195 | free(missing_values->missing_values); 196 | missing_values->missing_values = NULL; 197 | } 198 | 199 | if (missing_values->missing_value_lens != NULL) 200 | { 201 | free(missing_values->missing_value_lens); 202 | missing_values->missing_value_lens = NULL; 203 | } 204 | 205 | missing_values->num_missing_values = 0; 206 | } 207 | 208 | 209 | /* Deallocate pointer to fill value for specified field */ 210 | void clear_fill_value(FillValue *fill_value) 211 | { 212 | assert(fill_value != NULL); 213 | 214 | if (fill_value->fill_value != NULL) 215 | { 216 | free(fill_value->fill_value); 217 | fill_value->fill_value = NULL; 218 | } 219 | } 220 | 221 | 222 | void clear_fields(FieldList *fields) 223 | { 224 | uint32_t i; 225 | 226 | for (i = 0; i < fields->num_fields; i++) 227 | { 228 | if (fields->field_info[i].name != NULL) 229 | { 230 | free(fields->field_info[i].name); 231 | } 232 | fields->field_info[i].name = NULL; 233 | 234 | fields->field_info[i].converter = NULL; 235 | fields->field_info[i].converter_arg = NULL; 236 | 237 | clear_missing_values(&fields->field_info[i].missing_values); 238 | clear_fill_value(&fields->field_info[i].fill_value); 239 | 240 | fields->field_info[i].output_field_size = 0; 241 | fields->field_info[i].input_field_width = 0; 
242 | } 243 | 244 | free(fields->field_info); 245 | } 246 | 247 | 248 | /* Set fixed field width for specified field */ 249 | void set_field_width(FieldList *fields, uint32_t field, uint32_t width) 250 | { 251 | if (fields == NULL) 252 | return; 253 | 254 | if (field >= fields->num_fields) 255 | return; 256 | 257 | fields->field_info[field].input_field_width = width; 258 | } 259 | 260 | 261 | void reset_converters(FieldList *fields) 262 | { 263 | uint32_t field; 264 | 265 | if (fields == NULL) 266 | return; 267 | 268 | for (field = 0; field < fields->num_fields; field++) 269 | { 270 | fields->field_info[field].converter = NULL; 271 | fields->field_info[field].converter_arg = NULL; 272 | } 273 | } 274 | 275 | 276 | void set_converter(FieldList *fields, uint32_t field_num, char *field_name, 277 | uint32_t output_field_size, converter_func_ptr converter, 278 | void *converter_arg) 279 | { 280 | if (fields == NULL) 281 | return; 282 | 283 | if (field_num >= fields->num_fields) 284 | return; 285 | 286 | //if (field_name == NULL) 287 | // return; 288 | 289 | if (fields->field_info[field_num].name != NULL) 290 | { 291 | free(fields->field_info[field_num].name); 292 | } 293 | 294 | if (field_name != NULL) 295 | { 296 | fields->field_info[field_num].name = 297 | calloc(strlen(field_name), sizeof(char)); 298 | strncpy(fields->field_info[field_num].name, field_name, strlen(field_name)); 299 | } 300 | else 301 | { 302 | fields->field_info[field_num].name = NULL; 303 | } 304 | 305 | fields->field_info[field_num].converter = converter; 306 | fields->field_info[field_num].converter_arg = converter_arg; 307 | fields->field_info[field_num].output_field_size = output_field_size; 308 | } 309 | 310 | 311 | int infer_types(FieldList *fields) 312 | { 313 | uint32_t i; 314 | 315 | for (i = 0; i < fields->num_fields; i++) 316 | { 317 | if (fields->field_info[i].infer_type == 1) 318 | return 1; 319 | } 320 | 321 | return 0; 322 | } 323 | 
-------------------------------------------------------------------------------- /textadapter/lib/field_info.h: -------------------------------------------------------------------------------- 1 | #ifndef FIELD_INFO_H 2 | #define FIELD_INFO_H 3 | 4 | #include "converter_functions.h" 5 | 6 | 7 | typedef struct missing_values_t 8 | { 9 | char **missing_values; 10 | uint32_t *missing_value_lens; 11 | uint32_t num_missing_values; 12 | } MissingValues; 13 | 14 | 15 | typedef struct fill_value_t 16 | { 17 | void *fill_value; 18 | int loose; 19 | } FillValue; 20 | 21 | 22 | typedef struct field_info_t 23 | { 24 | char *name; 25 | 26 | /* converter function to convert data to target data type */ 27 | converter_func_ptr converter; 28 | void *converter_arg; 29 | 30 | MissingValues missing_values; 31 | 32 | FillValue fill_value; 33 | 34 | /* field width for fixed width data */ 35 | uint32_t input_field_width; 36 | 37 | /* field size in output array */ 38 | uint32_t output_field_size; 39 | 40 | /* flag allows user to fix the type. 
default, though, is to infer_type */ 41 | int infer_type; 42 | 43 | } FieldInfo; 44 | 45 | 46 | typedef struct field_list_t 47 | { 48 | uint32_t num_fields; 49 | FieldInfo *field_info; 50 | } FieldList; 51 | 52 | 53 | void clear_fields(FieldList *fields); 54 | void set_num_fields(FieldList *fields, uint32_t num_fields); 55 | 56 | void clear_missing_values(MissingValues *missing_values); 57 | void clear_fill_value(FillValue *fill_value); 58 | 59 | void init_missing_values(FieldList *fields, char *field_name, 60 | uint32_t field_num, uint32_t num_missing_values); 61 | 62 | void add_missing_value(FieldList *fields, char *field_name, 63 | uint32_t field_num, char *missing_value, uint32_t missing_value_len); 64 | 65 | void set_fill_value(FieldList *fields, char *field_name, 66 | uint32_t field_num, void *fill_value, uint32_t fill_value_len, int loose); 67 | 68 | uint32_t get_field_size(FieldList *fields, char *field_name, 69 | uint32_t field_num); 70 | uint32_t get_output_record_size(FieldList *fields); 71 | 72 | void set_field_width(FieldList *fields, uint32_t field, uint32_t width); 73 | 74 | /* Resets converter function pointers to null */ 75 | void reset_converters(FieldList *fields); 76 | 77 | /* Sets converter function for specified field with specified field size. 78 | * converter_arg will be passed to converter function when called. 
*/ 79 | void set_converter(FieldList *fields, uint32_t field_num, char *field_name, 80 | uint32_t output_field_size, converter_func_ptr converter, 81 | void *converter_arg); 82 | 83 | /* Initialize the type of each of the fields to be inferred */ 84 | void init_infer_types(FieldList *fields); 85 | 86 | int infer_types(FieldList *fields); 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /textadapter/lib/kstring.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "_stdint.h" 6 | #include "kstring.h" 7 | 8 | #ifdef _MSC_VER 9 | #define va_copy(d,s) ((d)=(s)) 10 | #endif 11 | 12 | int kvsprintf(kstring_t *s, const char *fmt, va_list ap) 13 | { 14 | va_list args; 15 | int l; 16 | va_copy(args, ap); 17 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. 18 | va_end(args); 19 | #ifdef _MSC_VER 20 | if (l == -1) { 21 | va_copy(args, ap); 22 | l = _vscprintf(fmt, args); 23 | va_end(args); 24 | #else 25 | if (l + 1 > s->m - s->l) { 26 | #endif 27 | s->m = s->l + l + 2; 28 | kroundup32(s->m); 29 | s->s = (char*)realloc(s->s, s->m); 30 | va_copy(args, ap); 31 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); 32 | va_end(args); 33 | } 34 | s->l += l; 35 | return l; 36 | } 37 | 38 | int ksprintf(kstring_t *s, const char *fmt, ...) 
39 | { 40 | va_list ap; 41 | int l; 42 | va_start(ap, fmt); 43 | l = kvsprintf(s, fmt, ap); 44 | va_end(ap); 45 | return l; 46 | } 47 | 48 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) 49 | { 50 | const char *p, *start; 51 | if (sep) { // set up the table 52 | if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished 53 | aux->finished = 0; 54 | if (sep[1]) { 55 | aux->sep = -1; 56 | aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; 57 | for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); 58 | } else aux->sep = sep[0]; 59 | } 60 | if (aux->finished) return 0; 61 | else if (str) aux->p = str - 1, aux->finished = 0; 62 | if (aux->sep < 0) { 63 | for (p = start = aux->p + 1; *p; ++p) 64 | if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; 65 | } else { 66 | for (p = start = aux->p + 1; *p; ++p) 67 | if (*p == aux->sep) break; 68 | } 69 | aux->p = p; // end of token 70 | if (*p == 0) aux->finished = 1; // no more tokens 71 | return (char*)start; 72 | } 73 | 74 | // s MUST BE a null terminated string; l = strlen(s) 75 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) 76 | { 77 | int i, n, max, last_char, last_start, *offsets, l; 78 | n = 0; max = *_max; offsets = *_offsets; 79 | l = strlen(s); 80 | 81 | #define __ksplit_aux do { \ 82 | if (_offsets) { \ 83 | s[i] = 0; \ 84 | if (n == max) { \ 85 | int *tmp; \ 86 | max = max? 
max<<1 : 2; \ 87 | if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \ 88 | offsets = tmp; \ 89 | } else { \ 90 | free(offsets); \ 91 | *_offsets = NULL; \ 92 | return 0; \ 93 | } \ 94 | } \ 95 | offsets[n++] = last_start; \ 96 | } else ++n; \ 97 | } while (0) 98 | 99 | for (i = 0, last_char = last_start = 0; i <= l; ++i) { 100 | if (delimiter == 0) { 101 | if (isspace(s[i]) || s[i] == 0) { 102 | if (isgraph(last_char)) __ksplit_aux; // the end of a field 103 | } else { 104 | if (isspace(last_char) || last_char == 0) last_start = i; 105 | } 106 | } else { 107 | if (s[i] == delimiter || s[i] == 0) { 108 | if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field 109 | } else { 110 | if (last_char == delimiter || last_char == 0) last_start = i; 111 | } 112 | } 113 | last_char = s[i]; 114 | } 115 | *_max = max; *_offsets = offsets; 116 | return n; 117 | } 118 | 119 | int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp) 120 | { 121 | size_t l0 = s->l; 122 | 123 | while (s->l == l0 || s->s[s->l-1] != '\n') { 124 | if (s->m - s->l < 200) ks_resize(s, s->m + 200); 125 | if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break; 126 | s->l += strlen(s->s + s->l); 127 | } 128 | 129 | if (s->l == l0) return EOF; 130 | 131 | if (s->l > l0 && s->s[s->l-1] == '\n') { 132 | s->l--; 133 | if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; 134 | } 135 | s->s[s->l] = '\0'; 136 | return 0; 137 | } 138 | 139 | /********************** 140 | * Boyer-Moore search * 141 | **********************/ 142 | 143 | typedef unsigned char ubyte_t; 144 | 145 | // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html 146 | static int *ksBM_prep(const ubyte_t *pat, int m) 147 | { 148 | int i, *suff, *prep, *bmGs, *bmBc; 149 | prep = (int*)calloc(m + 256, sizeof(int)); 150 | bmGs = prep; bmBc = prep + m; 151 | { // preBmBc() 152 | for (i = 0; i < 256; ++i) bmBc[i] = m; 153 | for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; 154 | } 155 | suff = 
(int*)calloc(m, sizeof(int)); 156 | { // suffixes() 157 | int f = 0, g; 158 | suff[m - 1] = m; 159 | g = m - 1; 160 | for (i = m - 2; i >= 0; --i) { 161 | if (i > g && suff[i + m - 1 - f] < i - g) 162 | suff[i] = suff[i + m - 1 - f]; 163 | else { 164 | if (i < g) g = i; 165 | f = i; 166 | while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; 167 | suff[i] = f - g; 168 | } 169 | } 170 | } 171 | { // preBmGs() 172 | int j = 0; 173 | for (i = 0; i < m; ++i) bmGs[i] = m; 174 | for (i = m - 1; i >= 0; --i) 175 | if (suff[i] == i + 1) 176 | for (; j < m - 1 - i; ++j) 177 | if (bmGs[j] == m) 178 | bmGs[j] = m - 1 - i; 179 | for (i = 0; i <= m - 2; ++i) 180 | bmGs[m - 1 - suff[i]] = m - 1 - i; 181 | } 182 | free(suff); 183 | return prep; 184 | } 185 | 186 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) 187 | { 188 | int i, j, *prep = 0, *bmGs, *bmBc; 189 | const ubyte_t *str, *pat; 190 | str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; 191 | prep = (_prep == 0 || *_prep == 0)? 
ksBM_prep(pat, m) : *_prep; 192 | if (_prep && *_prep == 0) *_prep = prep; 193 | bmGs = prep; bmBc = prep + m; 194 | j = 0; 195 | while (j <= n - m) { 196 | for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); 197 | if (i >= 0) { 198 | int max = bmBc[str[i+j]] - m + 1 + i; 199 | if (max < bmGs[i]) max = bmGs[i]; 200 | j += max; 201 | } else return (void*)(str + j); 202 | } 203 | if (_prep == 0) free(prep); 204 | return 0; 205 | } 206 | 207 | char *kstrstr(const char *str, const char *pat, int **_prep) 208 | { 209 | return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); 210 | } 211 | 212 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep) 213 | { 214 | return (char*)kmemmem(str, n, pat, strlen(pat), _prep); 215 | } 216 | 217 | /*********************** 218 | * The main() function * 219 | ***********************/ 220 | 221 | #ifdef KSTRING_MAIN 222 | #include 223 | int main() 224 | { 225 | kstring_t *s; 226 | int *fields, n, i; 227 | ks_tokaux_t aux; 228 | char *p; 229 | s = (kstring_t*)calloc(1, sizeof(kstring_t)); 230 | // test ksprintf() 231 | ksprintf(s, " abcdefg: %d ", 100); 232 | printf("'%s'\n", s->s); 233 | // test ksplit() 234 | fields = ksplit(s, 0, &n); 235 | for (i = 0; i < n; ++i) 236 | printf("field[%d] = '%s'\n", i, s->s + fields[i]); 237 | // test kstrtok() 238 | s->l = 0; 239 | for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { 240 | kputsn(p, aux.p - p, s); 241 | kputc('\n', s); 242 | } 243 | printf("%s", s->s); 244 | // free 245 | free(s->s); free(s); free(fields); 246 | 247 | { 248 | static char *str = "abcdefgcdgcagtcakcdcd"; 249 | static char *pat = "cd"; 250 | char *ret, *s = str; 251 | int *prep = 0; 252 | while ((ret = kstrstr(s, pat, &prep)) != 0) { 253 | printf("match: %s\n", ret); 254 | s = ret + prep[0]; 255 | } 256 | free(prep); 257 | } 258 | return 0; 259 | } 260 | #endif 261 | -------------------------------------------------------------------------------- 
/textadapter/lib/kstring.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #ifndef KSTRING_H 27 | #define KSTRING_H 28 | 29 | #include 30 | #include 31 | #include 32 | #include "_stdint.h" 33 | #include 34 | 35 | #ifdef _MSC_VER 36 | #define inline __inline 37 | #endif 38 | 39 | #ifndef kroundup32 40 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 41 | #endif 42 | 43 | #if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) 44 | #define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg))) 45 | #else 46 | #define KS_ATTR_PRINTF(fmt, arg) 47 | #endif 48 | 49 | 50 | /* kstring_t is a simple non-opaque type whose fields are likely to be 51 | * used directly by user code (but see also ks_str() and ks_len() below). 52 | * A kstring_t object is initialised by either of 53 | * kstring_t str = { 0, 0, NULL }; 54 | * kstring_t str; ...; str.l = str.m = 0; str.s = NULL; 55 | * and either ownership of the underlying buffer should be given away before 56 | * the object disappears (see ks_release() below) or the kstring_t should be 57 | * destroyed with free(str.s); */ 58 | #ifndef KSTRING_T 59 | #define KSTRING_T kstring_t 60 | typedef struct __kstring_t { 61 | size_t l, m; 62 | char *s; 63 | } kstring_t; 64 | #endif 65 | 66 | typedef struct { 67 | uint64_t tab[4]; 68 | int sep, finished; 69 | const char *p; // end of the current token 70 | } ks_tokaux_t; 71 | 72 | #ifdef __cplusplus 73 | extern "C" { 74 | #endif 75 | 76 | int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0); 77 | int ksprintf(kstring_t *s, const char *fmt, ...) 
KS_ATTR_PRINTF(2,3); 78 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); 79 | char *kstrstr(const char *str, const char *pat, int **_prep); 80 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep); 81 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); 82 | 83 | /* kstrtok() is similar to strtok_r() except that str is not 84 | * modified and both str and sep can be NULL. For efficiency, it is 85 | * actually recommended to set both to NULL in the subsequent calls 86 | * if sep is not changed. */ 87 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); 88 | 89 | /* kgetline() uses the supplied fgets()-like function to read a "\n"- 90 | * or "\r\n"-terminated line from fp. The line read is appended to the 91 | * kstring without its terminator and 0 is returned; EOF is returned at 92 | * EOF or on error (determined by querying fp, as per fgets()). */ 93 | typedef char *kgets_func(char *, int, void *); 94 | int kgetline(kstring_t *s, kgets_func *fgets, void *fp); 95 | 96 | #ifdef __cplusplus 97 | } 98 | #endif 99 | 100 | static inline int ks_resize(kstring_t *s, size_t size) 101 | { 102 | if (s->m < size) { 103 | char *tmp; 104 | s->m = size; 105 | kroundup32(s->m); 106 | if ((tmp = (char*)realloc(s->s, s->m))) 107 | s->s = tmp; 108 | else 109 | return -1; 110 | } 111 | return 0; 112 | } 113 | 114 | static inline char *ks_str(kstring_t *s) 115 | { 116 | return s->s; 117 | } 118 | 119 | static inline size_t ks_len(kstring_t *s) 120 | { 121 | return s->l; 122 | } 123 | 124 | // Give ownership of the underlying buffer away to something else (making 125 | // that something else responsible for freeing it), leaving the kstring_t 126 | // empty and ready to be used again, or ready to go out of scope without 127 | // needing free(str.s) to prevent a memory leak. 
128 | static inline char *ks_release(kstring_t *s) 129 | { 130 | char *ss = s->s; 131 | s->l = s->m = 0; 132 | s->s = NULL; 133 | return ss; 134 | } 135 | 136 | static inline int kputsn(const char *p, int l, kstring_t *s) 137 | { 138 | if (s->l + l + 1 >= s->m) { 139 | char *tmp; 140 | s->m = s->l + l + 2; 141 | kroundup32(s->m); 142 | if ((tmp = (char*)realloc(s->s, s->m))) 143 | s->s = tmp; 144 | else 145 | return EOF; 146 | } 147 | memcpy(s->s + s->l, p, l); 148 | s->l += l; 149 | s->s[s->l] = 0; 150 | return l; 151 | } 152 | 153 | static inline int kputs(const char *p, kstring_t *s) 154 | { 155 | return kputsn(p, strlen(p), s); 156 | } 157 | 158 | static inline int kputc(int c, kstring_t *s) 159 | { 160 | if (s->l + 1 >= s->m) { 161 | char *tmp; 162 | s->m = s->l + 2; 163 | kroundup32(s->m); 164 | if ((tmp = (char*)realloc(s->s, s->m))) 165 | s->s = tmp; 166 | else 167 | return EOF; 168 | } 169 | s->s[s->l++] = c; 170 | s->s[s->l] = 0; 171 | return c; 172 | } 173 | 174 | static inline int kputc_(int c, kstring_t *s) 175 | { 176 | if (s->l + 1 > s->m) { 177 | char *tmp; 178 | s->m = s->l + 1; 179 | kroundup32(s->m); 180 | if ((tmp = (char*)realloc(s->s, s->m))) 181 | s->s = tmp; 182 | else 183 | return EOF; 184 | } 185 | s->s[s->l++] = c; 186 | return 1; 187 | } 188 | 189 | static inline int kputsn_(const void *p, int l, kstring_t *s) 190 | { 191 | if (s->l + l > s->m) { 192 | char *tmp; 193 | s->m = s->l + l; 194 | kroundup32(s->m); 195 | if ((tmp = (char*)realloc(s->s, s->m))) 196 | s->s = tmp; 197 | else 198 | return EOF; 199 | } 200 | memcpy(s->s + s->l, p, l); 201 | s->l += l; 202 | return l; 203 | } 204 | 205 | static inline int kputw(int c, kstring_t *s) 206 | { 207 | char buf[16]; 208 | int i, l = 0; 209 | unsigned int x = c; 210 | if (c < 0) x = -x; 211 | do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); 212 | if (c < 0) buf[l++] = '-'; 213 | if (s->l + l + 1 >= s->m) { 214 | char *tmp; 215 | s->m = s->l + l + 2; 216 | kroundup32(s->m); 217 | if 
((tmp = (char*)realloc(s->s, s->m))) 218 | s->s = tmp; 219 | else 220 | return EOF; 221 | } 222 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 223 | s->s[s->l] = 0; 224 | return 0; 225 | } 226 | 227 | static inline int kputuw(unsigned c, kstring_t *s) 228 | { 229 | char buf[16]; 230 | int l, i; 231 | unsigned x; 232 | if (c == 0) return kputc('0', s); 233 | for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; 234 | if (s->l + l + 1 >= s->m) { 235 | char *tmp; 236 | s->m = s->l + l + 2; 237 | kroundup32(s->m); 238 | if ((tmp = (char*)realloc(s->s, s->m))) 239 | s->s = tmp; 240 | else 241 | return EOF; 242 | } 243 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 244 | s->s[s->l] = 0; 245 | return 0; 246 | } 247 | 248 | static inline int kputl(long c, kstring_t *s) 249 | { 250 | char buf[32]; 251 | int i, l = 0; 252 | unsigned long x = c; 253 | if (c < 0) x = -x; 254 | do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); 255 | if (c < 0) buf[l++] = '-'; 256 | if (s->l + l + 1 >= s->m) { 257 | char *tmp; 258 | s->m = s->l + l + 2; 259 | kroundup32(s->m); 260 | if ((tmp = (char*)realloc(s->s, s->m))) 261 | s->s = tmp; 262 | else 263 | return EOF; 264 | } 265 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 266 | s->s[s->l] = 0; 267 | return 0; 268 | } 269 | 270 | /* 271 | * Returns 's' split by delimiter, with *n being the number of components; 272 | * NULL on failue. 
273 | */ 274 | static inline int *ksplit(kstring_t *s, int delimiter, int *n) 275 | { 276 | int max = 0, *offsets = 0; 277 | *n = ksplit_core(s->s, delimiter, &max, &offsets); 278 | return offsets; 279 | } 280 | 281 | #endif 282 | -------------------------------------------------------------------------------- /textadapter/lib/kvec.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "kvec.h" 30 | int main() { 31 | kvec_t(int) array; 32 | kv_init(array); 33 | kv_push(int, array, 10); // append 34 | kv_a(int, array, 20) = 5; // dynamic 35 | kv_A(array, 20) = 4; // static 36 | kv_destroy(array); 37 | return 0; 38 | } 39 | */ 40 | 41 | /* 42 | 2008-09-22 (0.1.0): 43 | 44 | * The initial version. 
45 | 46 | */ 47 | 48 | #ifndef AC_KVEC_H 49 | #define AC_KVEC_H 50 | 51 | #include 52 | 53 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 54 | 55 | #define kvec_t(type) struct { size_t n, m; type *a; } 56 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) 57 | #define kv_destroy(v) free((v).a) 58 | #define kv_A(v, i) ((v).a[(i)]) 59 | #define kv_pop(v) ((v).a[--(v).n]) 60 | #define kv_size(v) ((v).n) 61 | #define kv_max(v) ((v).m) 62 | 63 | #define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) 64 | 65 | #define kv_copy(type, v1, v0) do { \ 66 | if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ 67 | (v1).n = (v0).n; \ 68 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ 69 | } while (0) \ 70 | 71 | #define kv_push(type, v, x) do { \ 72 | if ((v).n == (v).m) { \ 73 | (v).m = (v).m? (v).m<<1 : 2; \ 74 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 75 | } \ 76 | (v).a[(v).n++] = (x); \ 77 | } while (0) 78 | 79 | #define kv_pushp(type, v) (((v).n == (v).m)? \ 80 | ((v).m = ((v).m? (v).m<<1 : 2), \ 81 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 82 | : 0), ((v).a + ((v).n++)) 83 | 84 | #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ 85 | ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ 86 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 87 | : (v).n <= (size_t)(i)? 
(v).n = (i) \ 88 | : 0), (v).a[(i)] 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /textadapter/tests/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -g -Werror -Wall -Wdeclaration-after-statement 3 | TEXT_INCLUDE_DIRS = -I ../textadapter -I ../lib 4 | TEXT_LIBS = -lz -lpcre 5 | TEXT_OBJS = test_text_adapter.o text_adapter.o converter_functions.o index.o 6 | 7 | 8 | test_text_adapter: $(TEXT_OBJS) 9 | $(CC) $(CFLAGS) $(TEXT_OBJS) -o test_text_adapter $(TEXT_LIBS) 10 | 11 | test_text_adapter.o: test_text_adapter.c 12 | $(CC) $(CFLAGS) -c test_text_adapter.c $(TEXT_INCLUDE_DIRS) 13 | 14 | text_adapter.o: ../textadapter/text_adapter.c 15 | $(CC) $(CFLAGS) -c ../textadapter/text_adapter.c $(TEXT_INCLUDE_DIRS) 16 | 17 | converter_functions.o: ../lib/converter_functions.c 18 | $(CC) $(CFLAGS) -c ../lib/converter_functions.c $(TEXT_INCLUDE_DIRS) 19 | 20 | index.o: ../textadapter/index.c 21 | $(CC) $(CFLAGS) -c ../textadapter/index.c $(TEXT_INCLUDE_DIRS) 22 | 23 | field_info.o: ../lib/field_info.c 24 | $(CC) $(CFLAGS) -c ../lib/field_info.c $(TEXT_INCLUDE_DIRS) 25 | 26 | clean: 27 | -rm test_text_adapter 28 | -rm -f *.o 29 | 30 | -------------------------------------------------------------------------------- /textadapter/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContinuumIO/TextAdapter/53138c2277cdfcf32e127251313d4f77f81050aa/textadapter/tests/__init__.py -------------------------------------------------------------------------------- /textadapter/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | def pytest_addoption(parser): 4 | parser.addoption('--pg_host', action='store') 5 | parser.addoption('--pg_dbname', action='store') 6 | parser.addoption('--pg_user', action='store') 7 | 
parser.addoption('--acc_host', action='store') 8 | parser.addoption('--acc_user', action='store') 9 | parser.addoption('--acc_password', action='store') 10 | -------------------------------------------------------------------------------- /textadapter/tests/data/benchmarks.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | import os 3 | 4 | 5 | def timeFunction(function, setup): 6 | print 'timing', function 7 | t = timeit.Timer(stmt=function, setup=setup) 8 | times = [] 9 | for i in range(0,3): 10 | os.system('sudo sh -c "sync; echo 3 > /proc/sys/vm/drop_caches"') 11 | times.append(str(t.timeit(number=1))) 12 | return min(times) 13 | 14 | 15 | ints1 = timeFunction('blazeopt.loadtxt("ints1", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 16 | ints2 = timeFunction('blazeopt.loadtxt("ints2", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 17 | ints3 = timeFunction('blazeopt.loadtxt("ints3", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 18 | print ints1, ints2, ints3 19 | 20 | floats1 = timeFunction('blazeopt.loadtxt("floats1", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 21 | floats2 = timeFunction('blazeopt.loadtxt("floats2", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 22 | floats3 = timeFunction('blazeopt.loadtxt("floats3", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 23 | print floats1, floats2, floats3 24 | 25 | ints1 = timeFunction('blazeopt.genfromtxt("ints1", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 26 | ints2 = timeFunction('blazeopt.genfromtxt("ints2", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 27 | ints3 = timeFunction('blazeopt.genfromtxt("ints3", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 28 | print ints1, ints2, ints3 29 | 30 | floats1 = timeFunction('blazeopt.genfromtxt("floats1", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 31 | floats2 = 
timeFunction('blazeopt.genfromtxt("floats2", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 32 | floats3 = timeFunction('blazeopt.genfromtxt("floats3", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 33 | print floats1, floats2, floats3 34 | 35 | missingValues1 = timeFunction('blazeopt.genfromtxt("missingvalues1", dtype="u4,u4,u4,u4,u4", delimiter=",", missing_values={0:["NA","NaN"], 1:["xx","inf"]}, filling_values="999")', 'import blazeopt') 36 | missingValues2 = timeFunction('blazeopt.genfromtxt("missingvalues2", dtype="u4,u4,u4,u4,u4", delimiter=",", missing_values={0:["NA","NaN"], 1:["xx","inf"]}, filling_values="999")', 'import blazeopt') 37 | missingValues3 = timeFunction('blazeopt.genfromtxt("missingvalues3", dtype="u4,u4,u4,u4,u4", delimiter=",", missing_values={0:["NA","NaN"], 1:["xx","inf"]}, filling_values="999")', 'import blazeopt') 38 | print missingValues1, missingValues2, missingValues3 39 | 40 | fixedwidth1 = timeFunction('blazeopt.genfromtxt("fixedwidth1", dtype="u4,u4,u4,u4,u4", delimiter=[2,3,4,5,6])', 'import blazeopt') 41 | fixedwidth2 = timeFunction('blazeopt.genfromtxt("fixedwidth2", dtype="u4,u4,u4,u4,u4", delimiter=[2,3,4,5,6])', 'import blazeopt') 42 | fixedwidth3 = timeFunction('blazeopt.genfromtxt("fixedwidth3", dtype="u4,u4,u4,u4,u4", delimiter=[2,3,4,5,6])', 'import blazeopt') 43 | print fixedwidth1, fixedwidth2, fixedwidth3 44 | 45 | -------------------------------------------------------------------------------- /textadapter/tests/generate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import time 4 | import gzip 5 | import numpy 6 | 7 | 8 | def generate_dataset(output, valueIter, delimiter, num_recs): 9 | for i in range(0, num_recs): 10 | line = '' 11 | for j in range(0, 5): 12 | if j == 5 - 1: 13 | line += str(valueIter.next()) 14 | else: 15 | line += str(valueIter.next()) + delimiter 16 | output.write(line) 17 | output.write('\n') 18 | 
output.seek(0) 19 | 20 | 21 | class IntIter(object): 22 | 23 | def __init__(self): 24 | self.value = 0 25 | 26 | def __str__(self): 27 | return 'ints' 28 | 29 | def __iter__(self): 30 | return self 31 | 32 | def next(self): 33 | nextValue = self.value 34 | self.value = self.value + 1 35 | return nextValue 36 | 37 | 38 | class SignedIntIter(object): 39 | 40 | def __init__(self): 41 | self.value = -1 42 | 43 | def __str__(self): 44 | return 'signed int' 45 | 46 | def __iter__(self): 47 | return self 48 | 49 | def next(self): 50 | nextValue = self.value 51 | if self.value < 0: 52 | self.value = self.value - 1 53 | else: 54 | self.value = self.value + 1 55 | self.value *= -1 56 | return nextValue 57 | 58 | 59 | class FloatIter(object): 60 | 61 | def __init__(self): 62 | self.value = 0.0 63 | 64 | def __str__(self): 65 | return 'floats' 66 | 67 | def __iter__(self): 68 | return self 69 | 70 | def next(self): 71 | nextValue = self.value 72 | self.value = self.value + 0.1 73 | return nextValue 74 | 75 | 76 | class MissingValuesIter(object): 77 | 78 | def __init__(self): 79 | self.value = 0 80 | 81 | def __str__(self): 82 | return 'missing values' 83 | 84 | def __iter__(self): 85 | return self 86 | 87 | def next(self): 88 | nextValue = self.value 89 | if nextValue % 20 == 0: 90 | nextValue = 'NA' 91 | elif nextValue % 20 == 4: 92 | nextValue = 'xx' 93 | elif nextValue % 20 == 5: 94 | nextValue = 'NaN' 95 | elif nextValue % 20 == 9: 96 | nextValue = 'inf' 97 | self.value = self.value + 1 98 | return nextValue 99 | 100 | 101 | class FixedWidthIter(object): 102 | 103 | def __init__(self): 104 | self.field = 0 105 | self.fieldValues = ['00','000','0000','00000','000000'] 106 | 107 | def __str__(self): 108 | return 'fixed widths' 109 | 110 | def __iter__(self): 111 | return self 112 | 113 | def next(self): 114 | nextValue = self.fieldValues[self.field] 115 | 116 | self.field = self.field + 1 117 | if self.field == 5: 118 | self.field = 0 119 | self.fieldValues[0] = 
str((int(self.fieldValues[0]) + 1) % 100).zfill(2) 120 | self.fieldValues[1] = str((int(self.fieldValues[1]) + 1) % 1000).zfill(3) 121 | self.fieldValues[2] = str((int(self.fieldValues[2]) + 1) % 10000).zfill(4) 122 | self.fieldValues[3] = str((int(self.fieldValues[3]) + 1) % 100000).zfill(5) 123 | self.fieldValues[4] = str((int(self.fieldValues[4]) + 1) % 1000000).zfill(6) 124 | 125 | return nextValue 126 | 127 | 128 | class QuoteIter(object): 129 | 130 | def __init__(self): 131 | self.value = 0 132 | 133 | def __str__(self): 134 | return 'quoted strings' 135 | 136 | def __iter__(self): 137 | return self 138 | 139 | def next(self): 140 | nextValue = self.value 141 | characters = list(str(nextValue)) 142 | nextValue = '"' + ',\n'.join(characters) + '"' 143 | 144 | self.value = self.value + 1 145 | return nextValue 146 | 147 | 148 | class DateTimeIter(object): 149 | 150 | def __init__(self): 151 | self.value = 0 152 | 153 | def __str__(self): 154 | return 'datetime' 155 | 156 | def __iter__(self): 157 | return self 158 | 159 | def next(self): 160 | nextValue = self.value 161 | self.value = self.value + 1 162 | return numpy.datetime64(nextValue, 'D') 163 | 164 | 165 | if __name__ == "__main__": 166 | import sys 167 | if len(sys.argv) != 2: 168 | sys.exit("Please define number of records in datasets: ") 169 | 170 | numRecords = int(sys.argv[1]) 171 | 172 | output = open('./data/ints', 'w') 173 | generate_dataset(output, IntIter(), ',', numRecords) 174 | output.close() 175 | 176 | output = open('./data/floats', 'w') 177 | generate_dataset(output, FloatIter(), ',', numRecords) 178 | output.close() 179 | 180 | output = open('./data/missingvalues', 'w') 181 | generate_dataset(output, MissingValuesIter(), ',', numRecords) 182 | output.close() 183 | 184 | output = open('./data/fixedwidths', 'w') 185 | generate_dataset(output, FixedWidthIter(), '', numRecords) 186 | output.close() 187 | 188 | input = open('./data/ints', 'rb') 189 | output = gzip.open('./data/ints.gz', 'wb') 
190 | output.writelines(input) 191 | output.close() 192 | input.close 193 | 194 | '''generate_dataset('ints2', IntIter(), ',', 12500000) 195 | generate_dataset('ints3', IntIter(), ',', 25000000) 196 | generate_dataset('signedints1', SignedIntIter(), ',', 2500000) 197 | generate_dataset('floats1', FloatIter(), ',', 1500000) 198 | generate_dataset('floats2', FloatIter(), ',', 7500000) 199 | generate_dataset('floats3', FloatIter(), ',', 15000000) 200 | generate_dataset('missingvalues1', MissingValuesIter(), ',', 3000000) 201 | generate_dataset('missingvalues2', MissingValuesIter(), ',', 15000000) 202 | generate_dataset('missingvalues3', MissingValuesIter(), ',', 30000000) 203 | generate_dataset('fixedwidth1', FixedWidthIter(), '', 5000000) 204 | generate_dataset('fixedwidth2', FixedWidthIter(), '', 25000000) 205 | generate_dataset('fixedwidth3', FixedWidthIter(), '', 50000000) 206 | generate_dataset('ints_spacedelim', IntIter(), ' ', 2500000) 207 | generate_dataset('quotes', QuoteIter(), ' ', 2500000) 208 | generate_dataset('datetime', DateTimeIter(), ',', 2500000)''' 209 | 210 | -------------------------------------------------------------------------------- /textadapter/tests/test_ints.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdint.h> 4 | #include <assert.h> 5 | #include <time.h> 6 | #include "../textadapter/text_adapter.h" 7 | #include "../textadapter/io_functions.h" 8 | 9 | 10 | int main() 11 | { 12 | uint64_t num_fields = 5; 13 | 14 | FILE *input = fopen("./data/ints", "r"); 15 | setvbuf(input, NULL, _IONBF, 0); 16 | 17 | TextAdapter *adapter = open_text_adapter((void *)input, NULL, &read_file, NULL, &seek_file, NULL); 18 | adapter->tokenize = &delim_tokenizer; 19 | set_num_fields(adapter, num_fields); 20 | adapter->delim_char = ','; 21 | adapter->quote_char = '\0'; 22 | adapter->comment_char = '\0'; 23 | 24 | int c; 25 | for (c = 0; c < num_fields; c++) 26 | { 27 | set_converter(adapter, c, sizeof(uint32_t),
&uint_converter, NULL); 28 | } 29 | 30 | uint32_t *data = calloc(10000000, sizeof(uint32_t)*num_fields); 31 | 32 | fseek(input, 0, SEEK_SET); 33 | 34 | clock_t t0 = clock(); 35 | uint64_t recs_read = 0; 36 | int result = read_records(adapter, 10000000, 1, (char *)data, &recs_read); 37 | clock_t t1 = clock(); 38 | 39 | assert(result == ADAPTER_SUCCESS); 40 | 41 | printf("PASSED: read %llu records in %.2lf seconds\n", recs_read, (double)(t1-t0) / (double)CLOCKS_PER_SEC); 42 | 43 | free(data); 44 | close_text_adapter(adapter); 45 | } 46 | --------------------------------------------------------------------------------