├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── TEXTADAPTER_DEV.md ├── buildscripts ├── condarecipe │ ├── bld.bat │ ├── build.sh │ ├── meta.yaml │ └── run_test.py ├── fetch-dependencies └── jenkins-build ├── docs ├── Makefile ├── TextAdapter.rst ├── conf.py ├── eula.rst ├── genfromtxt.rst ├── index.rst ├── install.rst ├── loadtxt.rst ├── make.bat ├── release-notes.rst └── textadapter_examples.rst ├── environment.yml ├── setup.py ├── setupegg.py ├── textadapter ├── __init__.py ├── _version.py ├── core │ ├── IO.pyx │ ├── Index.pyx │ ├── TextAdapter.pxd │ ├── TextAdapter.pyx │ ├── __init__.py │ ├── genfromtxt.py │ ├── index.h │ ├── io.h │ ├── io_functions.c │ ├── io_functions.h │ ├── json_tokenizer.c │ ├── json_tokenizer.h │ ├── loadtxt.py │ ├── text_adapter.c │ └── text_adapter.h ├── examples │ ├── README │ ├── basic.py │ ├── converter.py │ ├── fixed_width.py │ ├── gzip_ints.py │ ├── missing_values.py │ └── regex.py ├── lib │ ├── Converters.pyx │ ├── __init__.py │ ├── _stdint.h │ ├── converter_functions.c │ ├── converter_functions.h │ ├── errors.py │ ├── field_info.c │ ├── field_info.h │ ├── khash.h │ ├── kstring.c │ ├── kstring.h │ └── kvec.h └── tests │ ├── Makefile │ ├── __init__.py │ ├── conftest.py │ ├── data │ └── benchmarks.py │ ├── generate.py │ ├── test_TextAdapter.py │ ├── test_ints.c │ ├── test_io.py │ └── test_text_adapter.c └── versioneer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | textadapter/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Editor temporary/working/backup files # 2 | ######################################### 3 | .#* 4 | [#]*# 5 | *~ 6 | *$ 7 | *.bak 8 | *.diff 9 | *.org 10 | .project 11 | *.rej 12 | .settings/ 13 | .*.sw[nop] 14 | .sw[nop] 15 | *.tmp 16 | 17 | # Compiled source # 18 | 
################### 19 | *.a 20 | *.com 21 | *.class 22 | *.dll 23 | *.exe 24 | *.o 25 | *.py[ocd] 26 | *.so 27 | 28 | # Python files # 29 | ################ 30 | # setup.py working directory 31 | build 32 | # sphinx build directory 33 | _build 34 | # setup.py dist directory 35 | dist 36 | doc/build 37 | doc/cdoc/build 38 | # Egg metadata 39 | *.egg-info 40 | # The shelf plugin uses this dir 41 | ./.shelf 42 | 43 | # Patches # 44 | ########### 45 | *.patch 46 | *.diff 47 | 48 | # OS generated files # 49 | ###################### 50 | .DS_Store* 51 | .VolumeIcon.icns 52 | .fseventsd 53 | Icon? 54 | .gdb_history 55 | ehthumbs.db 56 | Thumbs.db 57 | 58 | # Specific cython generated c files 59 | ###################### 60 | textadapter/core/TextAdapter.c 61 | 62 | # Generated data files for /tests and /examples 63 | textadapter/tests/data/fixedwidths 64 | textadapter/tests/data/floats 65 | textadapter/tests/data/ints 66 | textadapter/tests/data/ints.gz 67 | textadapter/tests/data/missingvalues 68 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009-2016, Continuum Analytics, Inc. and contributors All 2 | rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | Neither the name of Continuum Analytics nor the names of any contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 28 | THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Working Efficiently with Big Data in Text Formats Using Free Software 2 | 3 | One of our first commercial software products at Continuum Analytics was a product called *IOPro* which we have sold continuously since 2009. Now we are releasing the code under a liberal open source license. 4 | 5 | Following the path of widely adopted projects like conda, Blaze, Dask, odo, Numba, Conda, Bokeh, datashader, DataShape, DyND and other software that Continuum has created, we hope that the code in IOPro becomes valuable to open source communities and data scientists. 6 | 7 | We don't only hope this code is useful to you, however, we also hope you—or your colleagues—will be able to enhance, to refine and to develop the code further to increase its utility for the entire Python world. 
8 | 9 | ## What IOPro does 10 | 11 | IOPro loads NumPy arrays and Pandas DataFrames directly from files, SQL databases and NoSQL stores–including ones with millions (or billions) of rows. It provides a drop-in replacement for NumPy data loading functions but dramatically improves performance and starkly reduces memory overhead. 12 | 13 | The key concept in our code is that we access data via *adapters* which are something like enhanced file handles or database cursors. An adapter does not read data directly into memory, but rather provides a mechanism to use familiar NumPy/Pandas slicing syntax to load manageable segments of a large dataset. Moreover, an adapter provides fine-grained control over exactly *how* data is eventually read into memory, whether using custom patterns for how a line of data is parsed, choosing the precise data type of a textually represented number, or exposing data as "calculated fields" (that is, "virtual columns"). 14 | 15 | As well as local CSV, JSON or other textual data sources, IOPro can load data from Amazon S3 buckets. When accessing large datasets—especially ones too large to load into memory—from files that do not have fixed record sizes, IOPro's indexing feature allows users to seek to a specific collection of records tens, hundreds or thousands of times faster than is possible with a linear scan. 16 | 17 | ## Our release schedule 18 | 19 | The initial release of our open source code will be of the TextAdapter component that makes up the better part of the code in IOPro. This code will be renamed—straightforwardly enough—as **TextAdapter**. The project will live at https://github.com/ContinuumIO/TextAdapter. We will make this forked project available by October 15, 2016 under a BSD 3-Clause License. 20 | 21 | Continuum is evaluating the details of our release of the database adapters, but will definitely make the code (though possibly unrefined) available by December 31, 2016. 
Our main hesitation with releasing the database adapters is that the state of the art in Python database adapters has advanced considerably since 2009, and we do not want to advocate a codebase unless it is currently best-of-breed (at very least for some niche use case). At worst, we will still release the code as an historical artifact. Those projects will live at https://github.com/ContinuumIO/DBAdapter, https://github.com/ContinuumIO/PostgresAdapter, https://github.com/ContinuumIO/AccumuloAdapter, and https://github.com/ContinuumIO/MongoAdapter. 22 | 23 | If you are a current paid customer of IOPro, and are due for renewal before January 1, 2017, your sales rep will get in touch with you for renewal arrangements. We will continue to monitor and reply to issues and discussion about these successor projects at their GitHub repositories. 24 | 25 | Thank you to prior contributors at Continuum, especially Jay Bourque (jayvius), but notably also Francesc Alted (FrancescAlted), Óscar Villellas Guillén (ovillellas), Michael Kleehammer (mkleehammer) and Ilan Schnell (ilanschnell) for their wonderful contributions. Any remaining bugs are my responsibility alone as current maintainer of the project. 26 | 27 | ## The Blaze ecosystem 28 | 29 | As part of the open source release of TextAdapter, we plan to integrate TextAdapter into the Blaze ecosystem. Blaze itself, as well as odo, provide translation between data formats and querying of data within a large variety of formats. Putting TextAdapter clearly in this ecosystem will let an *adapter* act as one such data format, and hence leverage the indexing speedups and data massaging that TextAdapter provides. 30 | 31 | 32 | 33 | 34 | ## TextAdapter 35 | 36 | TextAdapter is a Python module containing optimized data adapters for 37 | importing data from a variety of data sources into NumPy arrays and Pandas 38 | DataFrame. Current data adapters include TextAdapter for JSON, free-form, 39 | and CSV-like text files. 
DBAdapter, also based on IOPro, accesses 40 | MongoAdapter for mongo databases, PostgresAdapter for PostgreSQL databases, 41 | AccumuloAdapter for Accumulo databases, and an optimized pyodbc module for 42 | accessing any relational database that supports the ODBC interface (SQL 43 | Server, PostgreSQL, MySQL, etc). 44 | 45 | ## Build Requirements 46 | 47 | Building TextAdapter requires a number of dependencies. In addition to a 48 | C/C++ dev environment, the following modules are needed, which can be 49 | installed via conda. 50 | 51 | * NumPy 52 | * Pandas 53 | * zlib 1.2.8 (C lib) 54 | * pcre 8.31 (C lib) 55 | 56 | ## Building Conda Package 57 | 58 | Note: If building under Windows, make sure the following commands are issued 59 | within the Visual Studio command prompt for version of Visual Studio that 60 | matches the version of Python you're building for. Python 2.6 and 2.7 needs 61 | Visual Studio 2008, Python 3.3 and 3.4 needs Visual Studio 2010, and Python 62 | 3.5 needs Visual Studio 2015. 63 | 64 | 1. Build TextAdapter using the following command: 65 | 66 | ``` 67 | conda build buildscripts/condarecipe --python 3.5 68 | ``` 69 | 70 | 1. TextAdapter can now be installed from the built conda package: 71 | 72 | ``` 73 | conda install textadapter --use-local 74 | ``` 75 | 76 | ## Building By Hand 77 | 78 | Note: If building under Windows, make sure the following commands are issued 79 | within the Visual Studio command prompt for version of Visual Studio that 80 | matches the version of Python you're building for. Python 2.6 and 2.7 needs 81 | Visual Studio 2008, Python 3.3 and 3.4 needs Visual Studio 2010, and Python 82 | 3.5 needs Visual Studio 2015. 83 | 84 | For building TextAdapter for local development/testing: 85 | 86 | 1. Install most of the above dependencies into environment called 87 | 'textadapter': 88 | 89 | ``` 90 | conda env create -f environment.yml 91 | ``` 92 | 93 | Be sure to activate new TextAdapter environment before proceeding. 
94 | 95 | 96 | 1. Build TextAdapter using Cython/distutils: 97 | 98 | ``` 99 | python setup.py build_ext --inplace 100 | ``` 101 | 102 | ## Testing 103 | 104 | Tests can be run by calling the iopro module's test function. By default 105 | only the TextAdapter tests will be run: 106 | 107 | ```python 108 | python -Wignore -c 'import textadapter; textadapter.test()' 109 | ``` 110 | 111 | (Note: `numpy.testing` might produce a FutureWarning that is not directly 112 | relevant to these unit tests) 113 | 114 | 115 | Related projects 116 | ---------------- 117 | 118 | - DBAdapter (SQL derivatives): https://github.com/ContinuumIO/DBAdapter 119 | - PostgresAdapter (PostgreSQL): https://github.com/ContinuumIO/PostgresAdapter 120 | - AccumuloAdapter (Apache Accumulo): https://github.com/ContinuumIO/AccumuloAdapter 121 | - MongoAdapter (MongoDB): https://github.com/ContinuumIO/MongoAdapter 122 | 123 | 124 | ## Other open source tools 125 | 126 | Other open source projects for interacting with large datasets provide either competitors or collaborative capabilities. 127 | 128 | * The **ParaText** from Wise Technology looks like a very promising approach to accelerating raw reads of CSV data. It doesn't currently provide regular expression matching nor as rich data typing as IOPro, but the raw reads are shockingly fast. Most importantly, perhaps, ParaText does not address indexing, so as fast as it is at linear scan, it remains stuck with big-O inefficiencies that TextAdapter addresses. I personally think that (optionally) utilizing the underlying reader of ParaText as a layer underneath TextAdapter would be a wonderful combination. Information about ParaText can be found at http://www.wise.io/tech/paratext 129 | 130 | Database access is almost always I/O bound rather than CPU bound, and hence the likely wins are by switching to asynchronous frameworks. 
This *does* involve using a somewhat different programming style than synchronous adapters, but some recent ones look amazingly fast. I am not yet sure whether it is worthwhile to create IOPro style adapters around these `asyncio`-based interfaces. 131 | 132 | * **asyncpg** is a database interface library designed specifically for PostgreSQL and Python/asyncio. asyncpg is an efficient, clean implementation of PostgreSQL server binary protocol. Information about asyncpg can be found at https://magicstack.github.io/asyncpg/current/. 133 | 134 | * **Motor** presents a callback- or Future-based API for non-blocking access to MongoDB from Tornado or asyncio. Information about Motor can be found at http://motor.readthedocs.io/en/stable/. 135 | -------------------------------------------------------------------------------- /TEXTADAPTER_DEV.md: -------------------------------------------------------------------------------- 1 | Notes on the Development and Design of the TextAdapter Module 2 | ============================================================= 3 | 4 | The TextAdapter module was the first and most complicated IOPro data 5 | adapter. The rest of the data adapters loosely follow the design of the 6 | TextAdapter module described below. 7 | 8 | Key Ideas 9 | --------- 10 | 11 | The TextAdapter module supports parsing tab delimited text, text with fixed 12 | width fields, JSON text, and text whose fields can be described with regular 13 | expressions. 14 | 15 | The guts of the TextAdapter module are written in C, with a Python interface 16 | implemented in Cython. 17 | 18 | The IOPro interface for the data adapters (TextAdapter, MongoAdapter, 19 | PostgresAdapter, and AccumuloAdapter) is designed to be numpy array-like in 20 | that slicing on the adapter is used to retrieve subsets of data. 
When the 21 | adapter object is first created, no data is actually read (except for a few 22 | records at the beginning of the input data to determine field types, number 23 | of fields, etc). 24 | 25 | IOPro is generally optimized for memory usage over speed, although speed is 26 | definitely a primary goal too. Data copying is kept to a minimum so that as 27 | much data as possible can be read into a numpy array. 28 | 29 | A TextAdapter object contains an array of function pointers, one for each 30 | field, that point to conversion functions that are responsible for 31 | converting input text data to the final output value. 32 | 33 | A TextAdapter object also contains a set of function pointers to IO related 34 | functions (open, seek, read, and close) responsible for reading data from the 35 | data source. Compressed data seek and read functions can also be set if 36 | source data is compressed. By combining normal IO function pointers with 37 | compressed data seek/read function pointers, the TextAdapter module can 38 | easily handle any supported data source that is also compressed with one of 39 | the supported compression schemes (currently only gzip). 40 | 41 | A TextAdapter object also contains a function pointer that points to a 42 | tokenizer function appropriate for the input text type. The tokenizer 43 | function is responsible for parsing the input text data and calling 44 | process_token to convert text data into the final output data type for the 45 | current field. Each text type has a tokenizer function. Tokenizer 46 | functions are also implemented for parsing lines and records as single 47 | string values (a record can be multiple lines). 48 | 49 | Key Low Level C Data Structures 50 | ======= 51 | 52 | TextAdapter (textadapter/core/text_adapter.h): 53 | 54 | Core struct for text parser. Contains attributes for input text such as 55 | delimiter character, comment character, etc. 
tokenize field is a function 56 | pointer to the tokenize function for parsing specific type of text (tab 57 | delimited, fixed width, etc). Also contains pointers to InputData struct 58 | and TextAdapterBuffer described below. 59 | 60 | InputData (textadapter/core/text_adapter.h): 61 | 62 | Contains function pointers for IO functions (open, read, seek, close) and 63 | for compressed data read and seek functions. Also contains a void *input 64 | field for storing a data structure specific to each data source (C FILE 65 | pointer, S3 bucket info, etc). 66 | 67 | TextAdapterBuffer (textadapter/core/text_adapter.h): 68 | 69 | Main buffer for storing text data to be parsed. 70 | 71 | Ideas for Future Optimizations 72 | ======= 73 | 74 | - The biggest performance gains could be had by incorporating some parallel 75 | processing goodness. The most natural way to split it up (this should 76 | work for all the adapters) might be to have one thread/process that reads 77 | the input data into the main buffer, and a second thread/process do the 78 | actual parsing and converting of the data, and storing of the converted 79 | data in the final numpy array. 80 | 81 | - Another idea for a potential speedup might be to refactor the parsing 82 | backend so that offsets for all the tokens for a field in the buffer are 83 | returned, and then have separate loops for different field types, that 84 | would power through all the tokens for a field and call the appropriate 85 | conversion function (the key would be to decide outside of the loops which 86 | loop+conversion function to execute, so that the conversion function would 87 | be inlined inside each loop). This is essentially how the Pandas CSV 88 | reader works, but it would increase memory usage. 
For example (in 89 | python-like pseudocode but implemented at the C level): 90 | 91 | ``` 92 | if field_type is integers: 93 | for i in range(num_records): 94 | convert_and_store_ints(field_token_offsets[i]) 95 | elif field_type is floats: 96 | for i in range(num_records): 97 | convert_and_store_floats(field_token_offsets[i]) 98 | ``` 99 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/bld.bat: -------------------------------------------------------------------------------- 1 | %PYTHON% setup.py install 2 | if errorlevel 1 exit 1 3 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $PYTHON setup.py install 4 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: textadapter 3 | version: "2.0.0" 4 | 5 | source: 6 | path: ../../ 7 | 8 | build: 9 | number: 0 10 | 11 | requirements: 12 | build: 13 | - python 14 | - numpy 15 | - cython 16 | 17 | # zlib and pcre versions pinned to fix shared library issues 18 | - zlib 1.2.8 19 | - pcre 8.31 20 | 21 | run: 22 | - python 23 | - numpy 24 | - pandas 25 | - six 26 | - ordereddict [py26] 27 | 28 | test: 29 | requires: 30 | - nose 31 | - pytest 32 | 33 | imports: 34 | - textadapter 35 | - textadapter.core.TextAdapter 36 | 37 | about: 38 | home: https://github.com/ContinuumIO/TextAdapter 39 | license: BSD 40 | summary: python interface to Amazon S3 and large data files 41 | -------------------------------------------------------------------------------- /buildscripts/condarecipe/run_test.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | assert textadapter.test() 4 | 5 | 
print('textadapter.__version__: %s' % textadapter.__version__) 6 | -------------------------------------------------------------------------------- /buildscripts/fetch-dependencies: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | IOPRO_INCLUDE_PATH=$HOME/anaconda/envs/iopro-test-np1.7/include/python2.7 5 | 6 | WHERE="$HOME/dependencies/" 7 | 8 | function mk_depend_path() 9 | { 10 | echo $WHERE$1 11 | } 12 | 13 | if [ ! -d $(mk_depend_path '') ]; then 14 | mkdir $(mk_depend_path '') 15 | fi 16 | 17 | pushd $(mk_depend_path '') >/dev/null 18 | 19 | 20 | # pcre version 8.30 21 | if [ ! -d pcre-8.30 ]; then 22 | if [ ! -f pcre-8.30.tar.gz ]; then 23 | wget ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.30.tar.gz 24 | fi 25 | 26 | tar -zxvf pcre-8.30.tar.gz 27 | pushd pcre-8.30 >/dev/null 28 | ./configure 29 | make 30 | popd >/dev/null 31 | fi 32 | 33 | if [ -f pcre-8.30.tar.gz ]; then 34 | # leave it clean 35 | rm pcre-8.30.tar.gz 36 | fi 37 | 38 | 39 | # zlib-1.2.7 40 | if [ ! -d zlib-1.2.7 ]; then 41 | if [ ! 
-f zlib-1.2.7.tar.bz2 ]; then 42 | wget http://downloads.sourceforge.net/project/libpng/zlib/1.2.7/zlib-1.2.7.tar.bz2 43 | fi 44 | tar -jxvf zlib-1.2.7.tar.bz2 45 | 46 | pushd zlib-1.2.7 >/dev/null 47 | ./configure 48 | make 49 | popd >/dev/null 50 | fi 51 | 52 | if [ -f zlib-1.2.7.tar.bz2 ]; then 53 | #leave it clean 54 | rm zlib-1.2.7.tar.bz2 55 | fi 56 | 57 | 58 | IOPRO_INCLUDE_PATH=$(mk_depend_path pcre-8.30):$IOPRO_INCLUDE_PATH 59 | IOPRO_INCLUDE_PATH=$(mk_depend_path zlib-1.2.7):$IOPRO_INCLUDE_PATH 60 | export IOPRO_INCLUDE_PATH 61 | 62 | echo 'IOPRO_INCLUDE_PATH=' $IOPRO_INCLUDE_PATH 63 | 64 | IOPRO_LIBRARY_PATH=$(mk_depend_path pcre-8.30/.libs):$IOPRO_LIBRARY_PATH 65 | IOPRO_LIBRARY_PATH=$(mk_depend_path zlib-1.2.7):$IOPRO_LIBRARY_PATH 66 | export IOPRO_LIBRARY_PATH 67 | 68 | echo 'IOPRO_LIBRARY_PATH=' $IOPRO_LIBRARY_PATH 69 | 70 | LD_LIBRARY_PATH=$(mk_depend_path pcre-8.30/.libs):$LD_LIBRARY_PATH 71 | LD_LIBRARY_PATH=$(mk_depend_path zlib-1.2.7):$LD_LIBRARY_PATH 72 | export LD_LIBRARY_PATH 73 | 74 | echo 'LD_LIBRARY_PATH=' $LD_LIBRARY_PATH 75 | 76 | popd >/dev/null 77 | 78 | printf '\n\nBuilding...\n' 79 | python setup.py build_ext --inplace --include-dirs=$IOPRO_INCLUDE_PATH --library-dirs=$IOPRO_LIBRARY_PATH || exit 1 80 | 81 | exit 82 | 83 | 84 | -------------------------------------------------------------------------------- /buildscripts/jenkins-build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHON_VERSION=2.7 4 | 5 | if [ "${PYTHON_VERSION}" == "" ]; then 6 | echo You must select a Python version with the PYTHON_VERSION variable. 7 | exit 1 8 | fi 9 | 10 | # Start from scratch 11 | if [ -d build ]; then 12 | rm -rf build 13 | fi 14 | mkdir build 15 | cd build 16 | 17 | # Use conda to create a conda environment of the required 18 | # python version and containing the dependencies. 
19 | export PYENV_PREFIX=${WORKSPACE}/build/pyenv 20 | rm -rf ${PYENV_PREFIX} 21 | ~/anaconda/bin/conda create --yes -p ${PYENV_PREFIX} anaconda python=${PYTHON_VERSION} numpy=1.7 || exit 1 22 | export PATH=${PYENV_PREFIX}/bin:${PATH} 23 | 24 | # JNB: Get rid of any iopro that conda may have installed 25 | rm -rf ${PYENV_PREFIX}/lib/python2.7/site-packages/iopro* 26 | 27 | # Get and build pcre lib 28 | if [ ! -f ${WORKSPACE}/pcre-8.30.tar.gz ]; then 29 | cd .. 30 | wget ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.30.tar.gz 31 | cd build 32 | fi 33 | 34 | tar -zxvf ../pcre-8.30.tar.gz 35 | cd pcre-8.30 36 | ./configure 37 | make 38 | cd .. 39 | 40 | # Get and build gzip compression lib 41 | if [ ! -f ${WORKSPACE}/zlib-1.2.7.tar.bz2 ]; then 42 | cd .. 43 | wget http://downloads.sourceforge.net/project/libpng/zlib/1.2.7/zlib-1.2.7.tar.bz2 44 | cd build 45 | fi 46 | 47 | tar -jxvf ../zlib-1.2.7.tar.bz2 48 | cd zlib-1.2.7 49 | ./configure 50 | make 51 | cd .. 52 | 53 | 54 | # Set up include and lib paths since we're not installing in default system paths 55 | export IOPRO_INCLUDE_PATH=${WORKSPACE}/build/pcre-8.30:$IOPRO_INCLUDE_PATH 56 | export IOPRO_LIBRARY_PATH=${WORKSPACE}/build/pcre-8.30/.libs:$IOPRO_LIBRARY_PATH 57 | export IOPRO_INCLUDE_PATH=${WORKSPACE}/build/zlib-1.2.7:$IOPRO_INCLUDE_PATH 58 | export IOPRO_LIBRARY_PATH=${WORKSPACE}/build/zlib-1.2.7:$IOPRO_LIBRARY_PATH 59 | export IOPRO_INCLUDE_PATH=~/anaconda/include/python${PYTHON_VERSION}:$IOPRO_INCLUDE_PATH 60 | export IOPRO_LIBRARY_PATH=~/anaconda/lib:$IOPRO_LIBRARY_PATH 61 | 62 | export LD_LIBRARY_PATH=${WORKSPACE}/build/pcre-8.30/.libs:$LD_LIBRARY_PATH 63 | export LD_LIBRARY_PATH=${WORKSPACE}/build/zlib-1.2.7:$LD_LIBRARY_PATH 64 | 65 | cd .. 
66 | python setup.py build_ext --inplace --include-dirs=$IOPRO_INCLUDE_PATH --library-dirs=$IOPRO_LIBRARY_PATH || exit 1 67 | python -c 'import textadapter; import sys; sys.exit(1 - textadapter.test(num_records=1000))' 68 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " 
gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IOPro.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IOPro.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 
88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/IOPro" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IOPro" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/TextAdapter.rst: -------------------------------------------------------------------------------- 1 | ----------- 2 | TextAdapter 3 | ----------- 4 | 5 | .. contents:: 6 | 7 | The TextAdapter module reads CSV data and produces a NumPy array containing the 8 | parsed data. The following features are currently implemented: 9 | 10 | * The TextAdapter engine is written 11 | in C to ensure text is parsed as fast as data can be read from the source. 12 | Text is read and parsed in small chunks instead of reading entire data into 13 | memory at once, which enables very large files to be read and parsed without 14 | running out of memory. 15 | 16 | * Python slicing notation can be used to specify a subset of records to be 17 | read from the data source, as well as a subset of fields. 18 | 19 | * Fields can be specified in any one of three ways: by a delimiter character, 20 | using fixed field widths, or by a regular expression. This enables a larger 21 | variety of CSV-like and other types of text files to be parsed. 22 | 23 | * A gzipped file can be parsed without having to uncompress it first. Parsing speed 24 | is about the same as an uncompressed version of same file. 25 | 26 | * An index of record offsets in a file can be built to allow fast random access to 27 | records. 
This index can be saved to disk and loaded again later. 28 | 29 | * Converter functions can be specified for converting parsed text to proper dtype 30 | for storing in NumPy array. 31 | 32 | * The TextAdapter engine has automatic type inference so the user does not have to 33 | specify dtypes of the output array. The user can still specify dtypes manually if 34 | desired. 35 | 36 | * Remote data stored in Amazon S3 can be read. An index can be built and stored 37 | with S3 data. Index can be read remotely, allowing for random access to S3 data. 38 | 39 | Methods 40 | ------- 41 | The TextAdapter module contains the following factory methods for creating TextAdapter objects: 42 | 43 | **text_adapter** (source, parser='csv', compression=None, comment='#', 44 | quote='"', num_records=0, header=0, field_names=True, 45 | indexing=False, index_name=None, encoding='utf-8') 46 | 47 | | Create a text adapter for reading CSV, JSON, or fixed width 48 | | text files, or a text file defined by regular expressions. 49 | 50 | | source - filename, file object, StringIO object, BytesIO object, S3 key, 51 | http url, or python generator 52 | | parser - Type of parser for parsing text. Valid parser types are 'csv', 'fixed width', 'regex', and 'json'. 
53 | | encoding - type of character encoding (currently ascii and utf8 are supported) 54 | | compression - type of data compression (currently only gzip is supported) 55 | | comment - character used to indicate comment line 56 | | quote - character used to quote fields 57 | | num_records - limits parsing to specified number of records; defaults 58 | to all records 59 | | header - number of lines in file header; these lines are skipped when parsing 60 | | footer - number of lines in file footer; these lines are skipped when parsing 61 | | indexing - create record index on the fly as characters are read 62 | | index_name - name of file to write index to 63 | | output - type of output object (numpy array or pandas dataframe) 64 | 65 | 66 | If parser is set to 'csv', additional parameters include: 67 | | delimiter - Delimiter character used to define fields in data source. Default is ','. 68 | 69 | If parser is set to 'fixed_width', additional parameters include: 70 | | field_widths - List of field widths 71 | 72 | If parser is set to 'regex', additional parameters include: 73 | | regex - Regular expression used to define records and fields in data source. 74 | See the regular expression example in the Advanced Usage section. 75 | 76 | **s3_text_adapter** (access_key, secret_key, bucket_name, key_name, remote_s3_index=False) 77 | parser='csv', compression=None, comment='#', 78 | quote='"', num_records=0, header=0, field_names=True, 79 | indexing=False, index_name=None, encoding='utf-8') 80 | 81 | | Create a text adapter for reading a text file from S3.
Text file can be 82 | | CSV, JSON, fixed width, or defined by regular expressions 83 | 84 | In addition to the arguments described for the text_adapter function above, 85 | the s3_text_adapter function also has the following parameters: 86 | 87 | | access_key - AWS access key 88 | | secret_key - AWS secret key 89 | | bucket_name - name of S3 bucket 90 | | key_name - name of key in S3 bucket 91 | | remote_s3_index - use remote S3 index (index name must be key name + '.idx' extension) 92 | 93 | 94 | The TextAdapter object returned by the text_adapter factory method contains the following methods: 95 | 96 | **set_converter** (field, converter) 97 | | Set converter function for field 98 | 99 | | field - field to apply converter function 100 | | converter - python function object 101 | 102 | **set_missing_values** (missing_values) 103 | | Set strings for each field that represents a missing value 104 | 105 | | missing_values - dict of field name or number, 106 | and list of missing value strings 107 | 108 | Default missing values: 'NA', 'NaN', 'inf', '-inf', 'None', 'none', '' 109 | 110 | **set_fill_values** (fill_values, loose=False) 111 | | Set fill values for each field 112 | 113 | | fill_values - dict of field name or number, and fill value 114 | | loose - If value cannot be converted, and value does not match 115 | any of the missing values, replace with fill value anyway. 116 | 117 | Default fill values for each data type: 118 | | int - 0 119 | | float - numpy.nan 120 | | char - 0 121 | | bool - False 122 | | object - numpy.nan 123 | | string - numpy.nan 124 | 125 | **create_index** (index_name=None, density=1) 126 | | Create an index of record offsets in file 127 | 128 | | index_name - Name of file on disk used to store index. If None, index 129 | will be created in memory but not saved. 130 | | density - density of index. Value of 1 will index every record, value of 131 | 2 will index every other record, etc. 
132 | 133 | **to_array** () 134 | | Parses entire data source and returns data as NumPy array object 135 | 136 | **to_dataframe** () 137 | | Parses entire data source and returns data as Pandas DataFrame object 138 | 139 | The TextAdapter object contains the following properties: 140 | 141 | **size** (readonly) 142 | | Number of records in data source. This value is only set if entire data 143 | source has been read or indexed, or number of records was specified in 144 | text_adapter factory method when creating object. 145 | 146 | **field_count** (readonly) 147 | | Number of fields in each record 148 | 149 | **field_names** 150 | | Field names to use when creating output NumPy array. Field names can be 151 | set here before reading data or in text_adapter function with 152 | field_names parameter. 153 | 154 | **field_types** 155 | | NumPy dtypes for each field, specified as a dict of fields and associated 156 | dtype. (Example: {0:'u4', 1:'f8', 2:'S10'}) 157 | 158 | **field_filter** 159 | | Fields in data source to parse, specified as a list of field numbers 160 | or names (Examples: [0, 1, 2] or ['f1', 'f3', 'f5']). This filter stays 161 | in effect until it is reset to empty list, or is overridden with array 162 | slicing (Example: adapter[[0, 1, 3, 4]][:]).
163 | 164 | See the NumPy data types documentation for more details: 165 | http://docs.continuum.io/anaconda/numpy/reference/arrays.dtypes.html 166 | 167 | The TextAdapter object supports array slicing: 168 | 169 | | Read all records: 170 | adapter[:] 171 | 172 | | Read first 100 records: 173 | adapter[0:100] 174 | 175 | | Read last record (only if data has been indexed or entire dataset 176 | has been read once before): 177 | adapter[-1] 178 | 179 | | Read first field in all records by specifying field number: 180 | adapter[0][:] 181 | 182 | | Read first field in all records by specifying field name: 183 | adapter['f0'][:] 184 | 185 | | Read first and third fields in all records: 186 | adapter[[0, 2]][:] 187 | 188 | Basic Usage 189 | ----------- 190 | 191 | Create TextAdapter object for data source:: 192 | 193 | >>> import iopro 194 | >>> adapter = iopro.text_adapter('data.csv', parser='csv') 195 | 196 | Parse text and store records in NumPy array using slicing notation:: 197 | 198 | >>> # read all records 199 | >>> array = adapter[:] 200 | 201 | >>> # read first ten records 202 | >>> array = adapter[0:10] 203 | 204 | >>> # read last record 205 | >>> array = adapter[-1] 206 | 207 | >>> # read every other record 208 | >>> array = adapter[::2] 209 | 210 | Advanced Usage 211 | -------------- 212 | 213 | user defined converter function for field 0:: 214 | 215 | >>> import iopro 216 | >>> import io 217 | 218 | >>> data = '1, abc, 3.3\n2, xxx, 9.9' 219 | >>> adapter = iopro.text_adapter(io.StringIO(data), parser='csv', field_names=False) 220 | 221 | >>> # Override default converter for first field 222 | >>> adapter.set_converter(0, lambda x: int(x)*2) 223 | >>> adapter[:] 224 | array([(2L, ' abc', 3.3), (4L, ' xxx', 9.9)], 225 | dtype=[('f0', '>> import iopro 230 | >>> import io 231 | 232 | >>> data = '1,abc,inf\n2,NA,9.9' 233 | >>> adapter = iopro.text_adapter(io.StringIO(data), parser='csv', field_names=False) 234 | 235 | >>> # Define field dtypes (example: set field 
1 to string object and field 2 to float) 236 | >>> adapter.field_types = {1:'O', 2:'f4'} 237 | 238 | >>> # Define list of strings for each field that represent missing values 239 | >>> adapter.set_missing_values({1:['NA'], 2:['inf']}) 240 | 241 | >>> # Set fill value for missing values in each field 242 | >>> adapter.set_fill_values({1:'xxx', 2:999.999}) 243 | >>> adapter[:] 244 | array([(' abc', 999.9990234375), ('xxx', 9.899999618530273)], 245 | dtype=[('f0', 'O'), ('f1', '>> import iopro 250 | >>> adapter = iopro.text_adapter('data.gz', parser='csv', compression='gzip') 251 | 252 | >>> # Build index of records and save index to disk. 253 | >>> adapter.create_index(index_name='index_file') 254 | 255 | >>> # Create new adapter object and load index from disk. 256 | >>> adapter = iopro.text_adapter('data.gz', parser='csv', compression='gzip', indexing=True, index_name='index_file') 257 | 258 | >>> # Read last record 259 | >>> adapter[-1] 260 | array([(100, 101, 102)],dtype=[('f0', '>> import iopro 265 | >>> import io 266 | 267 | >>> # Define regular expression to extract dollar amount, percentage, and month. 268 | >>> # Each set of parentheses defines a field. 269 | >>> data = '$2.56, 50%, September 20 1978\n$1.23, 23%, April 5 1981' 270 | >>> regex_string = '([0-9]\.[0-9][0-9]+)\,\s ([0-9]+)\%\,\s ([A-Za-z]+)' 271 | >>> adapter = iopro.text_adapter(io.StringIO(data), parser='regex', regex_string=regex_string, field_names=False, infer_types=False) 272 | 273 | >>> # set dtype of field to float 274 | >>> adapter.field_types = {0:'f4', 1:'u4', 2:'S10'} 275 | >>> adapter[:] 276 | array([(2.56, 50L, 'September'), (1.23, 23L, 'April')], 277 | dtype=[('f0', ' v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 
113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 
161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'IOProdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'IOPro.tex', u'IOPro Documentation', 187 | u'Continuum Analytics', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 
215 | man_pages = [ 216 | ('index', 'iopro', u'IOPro Documentation', 217 | ['Continuum Analytics', 'Jay Bourque', 'David Mertz'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'IOPro', u'IOPro Documentation', 231 | u'Continuum Analytics', 'IOPro', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 242 | #texinfo_show_urls = 'footnote' 243 | -------------------------------------------------------------------------------- /docs/eula.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | IOPro END USER LICENSE AGREEMENT 3 | ================================ 4 | 5 | IOPro ("the Software Product") and accompanying documentation is licensed and 6 | not sold. The Software Product is protected by copyright laws and treaties, as 7 | well as laws and treaties related to other forms of intellectual property. 8 | Continuum Analytics, Inc. or its subsidiaries, affiliates, and suppliers 9 | (collectively "Continuum") own intellectual property rights in the Software 10 | Product. The Licensee's ("you" or "your") license to download, use, copy, or 11 | change the Software Product is subject to these rights and to all the terms 12 | and conditions of this End User License Agreement ("Agreement"). 
13 | 14 | Acceptance 15 | ========== 16 | 17 | YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS AGREEMENT BY SELECTING 18 | THE "ACCEPT" OPTION AND DOWNLOADING THE SOFTWARE PRODUCT OR BY INSTALLING, 19 | USING, OR COPYING THE SOFTWARE PRODUCT. YOU MUST AGREE TO ALL OF THE TERMS OF 20 | THIS AGREEMENT BEFORE YOU WILL BE ALLOWED TO DOWNLOAD THE SOFTWARE PRODUCT. IF 21 | YOU DO NOT AGREE TO ALL OF THE TERMS OF THIS AGREEMENT, YOU MUST SELECT 22 | "DECLINE" AND YOU MUST NOT INSTALL, USE, OR COPY THE SOFTWARE PRODUCT. 23 | 24 | Trial Period 25 | ============ 26 | 27 | You have the right to use IOPro on a single computer or group of computers for 28 | 30 days with no license. After 30 days you must purchase an appropriate 29 | license to use the software on one or more machines or stop using the software 30 | and remove it from all of your machines on which you installed the software. 31 | 32 | License Grant 33 | ============= 34 | 35 | This Agreement entitles you to install and use one copy of the Software 36 | Product on as many machines as you will personally use. The Software Product 37 | is licensed to a particular user. Only the user to whom the software is 38 | licensed may use the software. You must obtain a license for as many users as 39 | you wish to use the software. In addition, you may make archival copies of 40 | the Software Product Installer. 41 | 42 | Right to Updates 43 | ================ 44 | 45 | This license entitles you to updates to the Software Product for one year from 46 | the time of payment. The Software Product will continue to function and you 47 | may continue to use The Software Product and any updates you have received for 48 | as long as you would like however you will no longer be able to receive 49 | updates from Continuum unless this License is renewed. Please contact 50 | sales@continuum.io with any questions or concerns. 
51 | 52 | Restrictions on Transfer 53 | ======================== 54 | 55 | Without first obtaining the express written consent of Continuum, you may not 56 | assign your rights and obligations under this Agreement, or redistribute, 57 | encumber, sell, rent, lease, sublicense, or otherwise transfer your rights to 58 | the Software Product. 59 | 60 | Restrictions on Use 61 | =================== 62 | 63 | You may not use, copy, or install the Software Product on any system where 64 | more than one user will be able to use the software unless you have purchased 65 | a license for each user of the system. You may not decompile, 66 | "reverse-engineer", disassemble, or otherwise attempt to derive the source 67 | code for the Software Product. 68 | 69 | Restrictions on Alteration 70 | ========================== 71 | 72 | You may not modify the Software Product or create any derivative work of the 73 | Software Product or its accompanying documentation. Derivative works include 74 | but are not limited to translations. You may not alter any files or libraries 75 | in any portion of the Software Product. 76 | 77 | Restrictions on Copying 78 | ======================= 79 | 80 | You may not copy any part of the Software Product except to the extent that 81 | licensed use inherently demands the creation of a temporary copy stored in 82 | computer memory and not permanently affixed on storage medium. You may make 83 | archival copies of the Software Product installer. 84 | 85 | Limited Software Product Warranty 86 | ================================= 87 | 88 | For a period of 60 days from the date of shipment or from the date that you 89 | download the Software Product, as applicable, Continuum warrants that when 90 | properly installed and used under normal conditions, the Software Product will 91 | perform substantially as advertised. 
92 | 93 | Disclaimer of Warranties and Limitation of Liability 94 | ==================================================== 95 | 96 | UNLESS OTHERWISE EXPLICITLY AGREED TO IN WRITING BY CONTINUUM, CONTINUUM MAKES 97 | NO OTHER WARRANTIES, EXPRESS OR IMPLIED, IN FACT OR IN LAW, INCLUDING, BUT NOT 98 | LIMITED TO, ANY IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A 99 | PARTICULAR PURPOSE OTHER THAN AS SET FORTH IN THIS AGREEMENT OR IN THE LIMITED 100 | WARRANTY DOCUMENTS PROVIDED WITH THE SOFTWARE PRODUCT. 101 | 102 | Continuum makes no warranty that the Software Product will meet your 103 | requirements or operate under your specific conditions of use. Continuum makes 104 | no warranty that operation of the Software Product will be secure, error free, 105 | or free from interruption. YOU MUST DETERMINE WHETHER THE SOFTWARE PRODUCT 106 | SUFFICIENTLY MEETS YOUR REQUIREMENTS FOR SECURITY AND UNINTERRUPTABILITY. YOU 107 | BEAR SOLE RESPONSIBILITY AND ALL LIABILITY FOR ANY LOSS INCURRED DUE TO 108 | FAILURE OF THE SOFTWARE PRODUCT TO MEET YOUR REQUIREMENTS. CONTINUUM WILL NOT, 109 | UNDER ANY CIRCUMSTANCES, BE RESPONSIBLE OR LIABLE FOR THE LOSS OF DATA ON ANY 110 | COMPUTER OR INFORMATION STORAGE DEVICE. UNDER NO CIRCUMSTANCES SHALL 111 | CONTINUUM, ITS DIRECTORS, OFFICERS, EMPLOYEES OR AGENTS BE LIABLE TO YOU OR 112 | ANY OTHER PARTY FOR INDIRECT, CONSEQUENTIAL, SPECIAL, INCIDENTAL, PUNITIVE, OR 113 | EXEMPLARY DAMAGES OF ANY KIND (INCLUDING LOST REVENUES OR PROFITS OR LOSS OF 114 | BUSINESS) RESULTING FROM THIS AGREEMENT, OR FROM THE FURNISHING, PERFORMANCE, 115 | INSTALLATION, OR USE OF THE SOFTWARE PRODUCT, WHETHER DUE TO A BREACH OF 116 | CONTRACT, BREACH OF WARRANTY, OR THE NEGLIGENCE OF CONTINUUM OR ANY OTHER 117 | PARTY, EVEN IF CONTINUUM IS ADVISED BEFOREHAND OF THE POSSIBILITY OF SUCH 118 | DAMAGES.
TO THE EXTENT THAT THE APPLICABLE JURISDICTION LIMITS CONTINUUM'S 119 | ABILITY TO DISCLAIM ANY IMPLIED WARRANTIES, THIS DISCLAIMER SHALL BE EFFECTIVE 120 | TO THE MAXIMUM EXTENT PERMITTED. 121 | 122 | Limitation of Remedies and Damages 123 | ================================== 124 | 125 | Your remedy for a breach of this Agreement or of any warranty included in this 126 | Agreement is the correction or replacement of the Software Product. Selection 127 | of whether to correct or replace shall be solely at the discretion of 128 | Continuum. Continuum reserves the right to substitute a functionally 129 | equivalent copy of the Software Product as a replacement. If Continuum is 130 | unable to provide a replacement or substitute Software Product or corrections 131 | to the Software Product, your sole alternate remedy shall be a refund of the 132 | purchase price for the Software Product exclusive of any costs for shipping 133 | and handling. Any claim must be made within the applicable warranty period. 134 | All warranties cover only defects arising under normal use and do not include 135 | malfunctions or failure resulting from misuse, abuse, neglect, alteration, 136 | problems with electrical power, acts of nature, unusual temperatures or 137 | humidity, improper installation, or damage determined by Continuum to have 138 | been caused by you. All limited warranties on the Software Product are granted 139 | only to you and are non-transferable. You agree to indemnify and hold 140 | Continuum harmless from all claims, judgments, liabilities, expenses, or costs 141 | arising from your breach of this Agreement and/or acts or omissions. 142 | 143 | Governing Law, Jurisdiction and Costs 144 | ===================================== 145 | 146 | This Agreement is governed by the laws of Texas, without regard to Texas's 147 | conflict or choice of law provisions. 
148 | 149 | Export Regulations 150 | ================== 151 | 152 | Any use or distribution of IOPro is made under conditions that the user and/or 153 | distributor is in full compliance with all export and other governing laws of 154 | the United States of America, including full and ongoing compliance with the 155 | Export Administration Regulations (EAR) of the United States Department of 156 | Commerce. See www.commerce.gov/ and 157 | http://www.bis.doc.gov/index.php/regulations/export-administration-regulations-ear. 158 | Use or distribution of Continuum software products to any persons, entities or 159 | countries currently under US sanctions is strictly prohibited. IOPro is 160 | classified with an ECCN of 5D992 with no license required for export to 161 | non-embargoed countries. 162 | 163 | The United States currently has embargoes against Cuba, Iran, North Korea, 164 | Sudan and Syria. The exportation, re-exportation, sale or supply, directly or 165 | indirectly, from the United States, or by a U.S. person wherever located, of 166 | any Continuum software to any of these countries is strictly prohibited 167 | without prior authorization by the United States Government. By accepting this 168 | Agreement, you represent to Continuum that you will comply with all applicable 169 | export regulations for IOPro. 170 | 171 | 172 | Severability 173 | ============ 174 | 175 | If any provision of this Agreement shall be held to be invalid or 176 | unenforceable, the remainder of this Agreement shall remain in full force and 177 | effect. To the extent any express or implied restrictions are not permitted by 178 | applicable laws, these express or implied restrictions shall remain in force 179 | and effect to the maximum extent permitted by such applicable laws.
180 | -------------------------------------------------------------------------------- /docs/genfromtxt.rst: -------------------------------------------------------------------------------- 1 | ---------------- 2 | iopro.genfromtxt 3 | ---------------- 4 | 5 | Load data from a text file, with missing values handled as specified. 6 | 7 | Each line past the first `skip_header` lines is split at the `delimiter` 8 | character, and characters following the `comments` character are discarded. 9 | 10 | Parameters 11 | ---------- 12 | fname : file or str 13 | File, filename, or generator to read. If the filename extension is 14 | `.gz` or `.bz2`, the file is first decompressed. Note that 15 | generators must return byte strings in Python 3k. 16 | dtype : dtype, optional 17 | Data type of the resulting array. 18 | If None, the dtypes will be determined by the contents of each 19 | column, individually. 20 | comments : str, optional 21 | The character used to indicate the start of a comment. 22 | All the characters occurring on a line after a comment are discarded 23 | delimiter : str, int, or sequence, optional 24 | The string used to separate values. By default, any consecutive 25 | whitespaces act as delimiter. An integer or sequence of integers 26 | can also be provided as width(s) of each field. 27 | skip_header : int, optional 28 | The numbers of lines to skip at the beginning of the file. 29 | skip_footer : int, optional 30 | The numbers of lines to skip at the end of the file 31 | converters : variable, optional 32 | The set of functions that convert the data of a column to a value. 33 | The converters can also be used to provide a default value 34 | for missing data: ``converters = {3: lambda s: float(s or 0)}``. 35 | missing_values : variable, optional 36 | The set of strings corresponding to missing data. 37 | filling_values : variable, optional 38 | The set of values to be used as default when the data are missing. 
39 | usecols : sequence, optional 40 | Which columns to read, with 0 being the first. For example, 41 | ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns. 42 | names : {None, True, str, sequence}, optional 43 | If `names` is True, the field names are read from the first valid line 44 | after the first `skip_header` lines. 45 | If `names` is a sequence or a single-string of comma-separated names, 46 | the names will be used to define the field names in a structured dtype. 47 | If `names` is None, the names of the dtype fields will be used, if any. 48 | excludelist : sequence, optional 49 | A list of names to exclude. This list is appended to the default list 50 | ['return','file','print']. Excluded names are appended an underscore: 51 | for example, `file` would become `file_`. 52 | deletechars : str, optional 53 | A string combining invalid characters that must be deleted from the 54 | names. 55 | defaultfmt : str, optional 56 | A format used to define default field names, such as "f%i" or "f_%02i". 57 | autostrip : bool, optional 58 | Whether to automatically strip white spaces from the variables. 59 | replace_space : char, optional 60 | Character(s) used in replacement of white spaces in the variables 61 | names. By default, use a '_'. 62 | case_sensitive : {True, False, 'upper', 'lower'}, optional 63 | If True, field names are case sensitive. 64 | If False or 'upper', field names are converted to upper case. 65 | If 'lower', field names are converted to lower case. 66 | unpack : bool, optional 67 | If True, the returned array is transposed, so that arguments may be 68 | unpacked using ``x, y, z = loadtxt(...)`` 69 | usemask : bool, optional 70 | If True, return a masked array. 71 | If False, return a regular array. 72 | invalid_raise : bool, optional 73 | If True, an exception is raised if an inconsistency is detected in the 74 | number of columns. 75 | If False, a warning is emitted and the offending lines are skipped. 
76 | 77 | Returns 78 | ------- 79 | out : ndarray 80 | Data read from the text file. If `usemask` is True, this is a 81 | masked array. 82 | 83 | See Also 84 | -------- 85 | iopro.loadtxt : equivalent function when no data is missing. 86 | 87 | Notes 88 | ----- 89 | * When spaces are used as delimiters, or when no delimiter has been given 90 | as input, there should not be any missing data between two fields. 91 | * When the variables are named (either by a flexible dtype or with `names`, 92 | there must not be any header in the file (else a ValueError 93 | exception is raised). 94 | * Individual values are not stripped of spaces by default. 95 | When using a custom converter, make sure the function does remove spaces. 96 | 97 | Examples 98 | --------- 99 | >>> import iopro 100 | >>> from io import StringIO 101 | 102 | Comma delimited file with mixed dtype 103 | 104 | >>> s = StringIO("1,1.3,abcde") 105 | >>> data = iopro.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'), 106 | ... ('mystring','S5')], delimiter=",") 107 | >>> data 108 | array((1, 1.3, 'abcde'), 109 | dtype=[('myint', '>> s.seek(0) # needed for StringIO example only 114 | >>> data = iopro.genfromtxt(s, dtype=None, 115 | ... names = ['myint','myfloat','mystring'], delimiter=",") 116 | >>> data 117 | array((1, 1.3, 'abcde'), 118 | dtype=[('myint', '>> s.seek(0) 123 | >>> data = iopro.genfromtxt(s, dtype="i8,f8,S5", 124 | ... names=['myint','myfloat','mystring'], delimiter=",") 125 | >>> data 126 | array((1, 1.3, 'abcde'), 127 | dtype=[('myint', '>> s = StringIO("11.3abcde") 132 | >>> data = iopro.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'], 133 | ... delimiter=[1,3,5]) 134 | >>> data 135 | array((1, 1.3, 'abcde'), 136 | dtype=[('intvar', '`_. 21 | 22 | To start a 30-day free trial just download and install the IOPro package. 
23 | 24 | If you already have `Anaconda `_ (free 25 | Python platform) or `Miniconda `_ 26 | installed:: 27 | 28 | conda update conda 29 | conda install iopro 30 | 31 | If you do not have Anaconda installed, you can `download it 32 | `_. 33 | 34 | For more information about IOPro please contact `sales@continuum.io 35 | `_. 36 | 37 | Requirements 38 | ------------ 39 | 40 | * Python 2.7 or 3.4+ 41 | * NumPy 1.10+ 42 | 43 | Optional Python modules: 44 | 45 | * Boto (for S3 support) 46 | * Pandas (to use DataFrames) 47 | 48 | What's new in version 1.9? 49 | -------------------------- 50 | 51 | The documentation has been substantially updated for version 1.9.0. 52 | Numba has been removed and the code has been cleaned up, but no other 53 | features were added or removed. Some refactoring was done that didn't 54 | change functionality. We recommend that users not use older versions. 55 | See :doc:`Release notes ` for additional detail. 56 | 57 | 58 | Getting started 59 | --------------- 60 | 61 | Some of the basic usage patterns look like these. Create TextAdapter object 62 | for data source:: 63 | 64 | >>> import iopro 65 | >>> adapter = iopro.text_adapter('data.csv', parser='csv') 66 | 67 | Define field dtypes (example: set field 0 to unsigned int and field 4 to 68 | float):: 69 | 70 | >>> adapter.set_field_types({0: 'u4', 4:'f4'}) 71 | 72 | Parse text and store records in NumPy array using slicing notation:: 73 | 74 | >>> # read all records 75 | >>> array = adapter[:] 76 | 77 | >>> # read first ten records 78 | >>> array = adapter[0:10] 79 | 80 | >>> # read last record 81 | >>> array = adapter[-1] 82 | 83 | >>> # read every other record 84 | >>> array = adapter[::2] 85 | 86 | User guide 87 | ---------- 88 | 89 | .. toctree:: 90 | :maxdepth: 1 91 | 92 | install 93 | textadapter_examples 94 | eula 95 | release-notes 96 | 97 | Reference guide 98 | --------------- 99 | 100 | .. 
toctree:: 101 | :maxdepth: 1 102 | 103 | TextAdapter 104 | loadtxt 105 | genfromtxt 106 | 107 | 108 | Previous Versions 109 | ----------------- 110 | 111 | This documentation is provided for the use of our customers who have not yet upgraded 112 | to the current version. 113 | 114 | NOTE: We recommend that users not use older versions of IOPro. 115 | 116 | .. toctree:: 117 | :maxdepth: 1 118 | 119 | IOPro 1.8.0 <1.8.0/index> 120 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | If you do not already have Anaconda installed, please download it via the 5 | `downloads page `_ and install it. 6 | 7 | IOPro is included with `Anaconda Workgroup and Anaconda Enterprise 8 | subscriptions `_. 9 | 10 | To start a 30-day free trial just download and install the IOPro package. 11 | 12 | If you already have `Anaconda `_ (free 13 | Python platform) or `Miniconda ` 14 | installed:: 15 | 16 | conda update conda 17 | conda install iopro 18 | 19 | If you do not have Anaconda installed, you can `download it 20 | `_. 21 | 22 | For more information about IOPro please contact `sales@continuum.io 23 | `_. 24 | 25 | IOPro Update Instructions 26 | ------------------------- 27 | 28 | If you have Anaconda (free Python platform) installed, first update 29 | the conda package management tool to the latest version, then use conda 30 | to update the IOPro product installation:: 31 | 32 | conda update conda 33 | conda update iopro 34 | 35 | Uninstall 36 | --------- 37 | 38 | To uninstall using conda:: 39 | 40 | conda remove iopro 41 | 42 | 43 | Installing license 44 | ------------------ 45 | 46 | The IOPro license can be installed with the graphical Anaconda Navigator license 47 | manager or manually with your operating system. In your organization this may be 48 | handled by your site administrator or IT department. 
Both installation methods 49 | are explained in the :doc:`License installation ` 50 | page. 51 | -------------------------------------------------------------------------------- /docs/loadtxt.rst: -------------------------------------------------------------------------------- 1 | ------------- 2 | iopro.loadtxt 3 | ------------- 4 | 5 | Load data from a text file. 6 | 7 | Each row in the text file must have the same number of values. 8 | 9 | Parameters 10 | ---------- 11 | fname : file or str 12 | File, filename, or generator to read. If the filename extension is 13 | ``.gz`` or ``.bz2``, the file is first decompressed. Note that 14 | generators should return byte strings for Python 3k. 15 | dtype : data-type, optional 16 | Data-type of the resulting array; default: float. If this is a 17 | record data-type, the resulting array will be 1-dimensional, and 18 | each row will be interpreted as an element of the array. In this 19 | case, the number of columns used must match the number of fields in 20 | the data-type. 21 | comments : str, optional 22 | The character used to indicate the start of a comment; 23 | default: '#'. 24 | delimiter : str, optional 25 | The string used to separate values. By default, this is any 26 | whitespace. 27 | converters : dict, optional 28 | A dictionary mapping column number to a function that will convert 29 | that column to a float. E.g., if column 0 is a date string: 30 | ``converters = {0: datestr2num}``. Converters can also be used to 31 | provide a default value for missing data (but see also `iopro.genfromtxt`): 32 | ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None. 33 | skiprows : int, optional 34 | Skip the first `skiprows` lines; default: 0. 35 | usecols : sequence, optional 36 | Which columns to read, with 0 being the first. For example, 37 | ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. 38 | The default, None, results in all columns being read. 
39 | unpack : bool, optional 40 | If True, the returned array is transposed, so that arguments may be 41 | unpacked using ``x, y, z = iopro.loadtxt(...)``. When used with a record 42 | data-type, arrays are returned for each field. Default is False. 43 | ndmin : int, optional 44 | The returned array will have at least `ndmin` dimensions. 45 | Otherwise mono-dimensional axes will be squeezed. 46 | Legal values: 0 (default), 1 or 2. 47 | .. versionadded:: 1.6.0 48 | 49 | Returns 50 | ------- 51 | out : ndarray 52 | Data read from the text file. 53 | 54 | See Also 55 | -------- 56 | iopro.genfromtxt : Load data with missing values handled as specified. 57 | 58 | Examples 59 | -------- 60 | 61 | simple parse of StringIO object data 62 | >>> import iopro 63 | >>> from io import StringIO # StringIO behaves like a file object 64 | >>> c = StringIO("0 1\\n2 3") 65 | >>> iopro.loadtxt(c) 66 | >>> array([[ 0., 1.], 67 | [ 2., 3.]]) 68 | 69 | set dtype of output array 70 | >>> d = StringIO("M 21 72\\nF 35 58") 71 | >>> iopro.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'), 72 | ... 'formats': ('S1', 'i4', 'f4')}) 73 | >>> array([('M', 21, 72.0), ('F', 35, 58.0)], 74 | dtype=[('gender', '|S1'), ('age', '>> c = StringIO("1,0,2\\n3,0,4") 78 | >>> x, y = iopro.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True) 79 | >>> x 80 | >>> array([ 1., 3.]) 81 | >>> y 82 | >>> array([ 2., 4.]) 83 | 84 | 85 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. 
The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\IOPro.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\IOPro.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 
187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/release-notes.rst: -------------------------------------------------------------------------------- 1 | ../CHANGELOG -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: textadapter 2 | dependencies: 3 | - ipython 4 | - numpy 5 | - pandas 6 | - pytest 7 | - cython 8 | - pcre 9 | - zlib 10 | - nose 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from distutils.core import setup, Command 4 | from distutils.extension import Extension 5 | from Cython.Distutils import build_ext 6 | import numpy 7 | import versioneer 8 | 9 | 10 | class CleanInplace(Command): 11 | user_options = [] 12 | 13 | def initialize_options(self): 14 | self.cwd = None 15 | 16 | def finalize_options(self): 17 | self.cwd = os.getcwd() 18 | 19 | def run(self): 20 | files = ['./textadapter/core/TextAdapter.c', 21 | './textadapter/core/TextAdapter.so'] 22 | for file in files: 23 | try: 24 | os.remove(file) 25 | except OSError: 26 | pass 27 | 28 | 29 | def setup_text(include_dirs, lib_dirs): 30 | src = ['textadapter/core/TextAdapter.pyx', 31 | 'textadapter/core/text_adapter.c', 32 | 'textadapter/lib/converter_functions.c', 33 | 'textadapter/core/io_functions.c', 34 | 'textadapter/lib/field_info.c', 35 | 'textadapter/core/json_tokenizer.c'] 36 | 37 | if sys.platform == 'win32': 38 | zlib_lib = 'zlibstatic' 39 | else: 40 | zlib_lib = 'z' 41 | 42 | compile_args = [] 43 | if '--debug' in sys.argv: 44 | if sys.platform == 'win32': 45 | compile_args.append('/DDEBUG_ADAPTER') 46 | else: 47 | compile_args.append('-DDEBUG_ADAPTER') 48 | 49 | libraries = ['pcre', zlib_lib] 50 | 
include_dirs = ['textadapter/core'] + include_dirs 51 | 52 | return Extension("textadapter.core.TextAdapter", 53 | src, 54 | include_dirs=include_dirs, 55 | library_dirs=lib_dirs, 56 | libraries=libraries, 57 | extra_compile_args=compile_args) 58 | 59 | 60 | def run_setup(): 61 | include_dirs = [os.path.join('textadapter', 'lib'), 62 | numpy.get_include()] 63 | if sys.platform == 'win32': 64 | include_dirs.append(os.path.join(sys.prefix, 'Library', 'include')) 65 | else: 66 | include_dirs.append(os.path.join(sys.prefix, 'include')) 67 | 68 | lib_dirs = [] 69 | if sys.platform == 'win32': 70 | lib_dirs.append(os.path.join(sys.prefix, 'Library', 'lib')) 71 | else: 72 | lib_dirs.append(os.path.join(sys.prefix, 'lib')) 73 | 74 | ext_modules = [] 75 | packages = ['textadapter', 'textadapter.lib', 'textadapter.tests'] 76 | 77 | ext_modules.append(setup_text(include_dirs, lib_dirs)) 78 | packages.append('textadapter.core') 79 | 80 | versioneer.versionfile_source = 'textadapter/_version.py' 81 | versioneer.versionfile_build = 'textadapter/_version.py' 82 | versioneer.tag_prefix = '' 83 | versioneer.parentdir_prefix = 'textadapter-' 84 | 85 | cmdclass = versioneer.get_cmdclass() 86 | cmdclass['build_ext'] = build_ext 87 | cmdclass['cleanall'] = CleanInplace 88 | 89 | setup(name='textadapter', 90 | version = versioneer.get_version(), 91 | description='optimized IO for NumPy/Blaze', 92 | author='Continuum Analytics', 93 | author_email='david.mertz@continuum.io', 94 | ext_modules=ext_modules, 95 | packages=packages, 96 | cmdclass=cmdclass) 97 | 98 | 99 | if __name__ == '__main__': 100 | run_setup() 101 | -------------------------------------------------------------------------------- /setupegg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Wrapper to run setup.py using setuptools.""" 3 | 4 | import os, sys 5 | 6 | # now, import setuptools and call the actual setup 7 | import setuptools 8 | try: 9 | 
execfile('setup.py') 10 | except NameError: 11 | exec( open('setup.py','rb').read() ) 12 | -------------------------------------------------------------------------------- /textadapter/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TextAdapter 3 | ~~~~~ 4 | 5 | TextAdapter provides tools to interface large data files in a fast, memory-efficient way. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | from textadapter._version import get_versions 10 | __version__ = get_versions()['version'] 11 | del get_versions 12 | 13 | from textadapter.core.TextAdapter import (ArrayDealloc, CSVTextAdapter, 14 | FixedWidthTextAdapter, JSONTextAdapter, 15 | RegexTextAdapter, s3_text_adapter, 16 | text_adapter) 17 | from textadapter.core.loadtxt import loadtxt 18 | from textadapter.core.genfromtxt import genfromtxt 19 | from textadapter.lib.errors import (AdapterException, AdapterIndexError, 20 | ArgumentError, ConfigurationError, 21 | DataIndexError, DataTypeError, 22 | InternalInconsistencyError, NoSuchFieldError, 23 | ParserError, SourceError, SourceNotFoundError) 24 | 25 | 26 | def test(verbosity=1, num_records=100000, results=[]): 27 | from textadapter.tests.test_TextAdapter import run as run_textadapter_tests 28 | result_text = run_textadapter_tests(verbosity=verbosity, 29 | num_records=num_records) 30 | results.append(result_text) 31 | 32 | from textadapter.tests.test_io import run as run_io_tests 33 | result_text = run_io_tests(verbosity=verbosity) 34 | results.append(result_text) 35 | 36 | for result in results: 37 | if not result.wasSuccessful(): 38 | return False 39 | return True 40 | 41 | -------------------------------------------------------------------------------- /textadapter/_version.py: -------------------------------------------------------------------------------- 1 | IN_LONG_VERSION_PY = True 2 | # This file helps to compute a version number in source trees obtained from 3 | # git-archive tarball 
(such as those provided by github's download-from-tag 4 | # feature). Distribution tarballs (build by setup.py sdist) and build 5 | # directories (produced by setup.py build) will contain a much shorter file 6 | # that just contains the computed version number. 7 | 8 | # This file is released into the public domain. Generated by 9 | # versioneer-0.7+ (https://github.com/warner/python-versioneer) 10 | 11 | # these strings will be replaced by git during git-archive 12 | git_refnames = " (HEAD -> master, tag: v2.0.0)" 13 | git_full = "53138c2277cdfcf32e127251313d4f77f81050aa" 14 | 15 | GIT = "git" 16 | 17 | 18 | import subprocess 19 | import sys 20 | 21 | def run_command(args, cwd=None, verbose=False): 22 | try: 23 | # remember shell=False, so use git.cmd on windows, not just git 24 | p = subprocess.Popen(args, stdout=subprocess.PIPE, cwd=cwd) 25 | except EnvironmentError: 26 | e = sys.exc_info()[1] 27 | if verbose: 28 | print("unable to run %s" % args[0]) 29 | print(e) 30 | return None 31 | stdout = p.communicate()[0].strip() 32 | if sys.version >= '3': 33 | stdout = stdout.decode() 34 | if p.returncode != 0: 35 | if verbose: 36 | print("unable to run %s (error)" % args[0]) 37 | return None 38 | return stdout 39 | 40 | 41 | import sys 42 | import re 43 | import os.path 44 | 45 | def get_expanded_variables(versionfile_source): 46 | # the code embedded in _version.py can just fetch the value of these 47 | # variables. When used from setup.py, we don't want to import 48 | # _version.py, so we do it with a regexp instead. This function is not 49 | # used from _version.py. 
50 | variables = {} 51 | try: 52 | for line in open(versionfile_source,"r").readlines(): 53 | if line.strip().startswith("git_refnames ="): 54 | mo = re.search(r'=\s*"(.*)"', line) 55 | if mo: 56 | variables["refnames"] = mo.group(1) 57 | if line.strip().startswith("git_full ="): 58 | mo = re.search(r'=\s*"(.*)"', line) 59 | if mo: 60 | variables["full"] = mo.group(1) 61 | except EnvironmentError: 62 | pass 63 | return variables 64 | 65 | def versions_from_expanded_variables(variables, tag_prefix, verbose=False): 66 | refnames = variables["refnames"].strip() 67 | if refnames.startswith("$Format"): 68 | if verbose: 69 | print("variables are unexpanded, not using") 70 | return {} # unexpanded, so not in an unpacked git-archive tarball 71 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) 72 | for ref in list(refs): 73 | if not re.search(r'\d', ref): 74 | if verbose: 75 | print("discarding '%s', no digits" % ref) 76 | refs.discard(ref) 77 | # Assume all version tags have a digit. git's %d expansion 78 | # behaves like git log --decorate=short and strips out the 79 | # refs/heads/ and refs/tags/ prefixes that would let us 80 | # distinguish between branches and tags. By ignoring refnames 81 | # without digits, we filter out many common branch names like 82 | # "release" and "stabilization", as well as "HEAD" and "master". 83 | if verbose: 84 | print("remaining refs: %s" % ",".join(sorted(refs))) 85 | for ref in sorted(refs): 86 | # sorting will prefer e.g. 
"2.0" over "2.0rc1" 87 | if ref.startswith(tag_prefix): 88 | r = ref[len(tag_prefix):] 89 | if verbose: 90 | print("picking %s" % r) 91 | return { "version": r, 92 | "full": variables["full"].strip() } 93 | # no suitable tags, so we use the full revision id 94 | if verbose: 95 | print("no suitable tags, using full revision id") 96 | return { "version": variables["full"].strip(), 97 | "full": variables["full"].strip() } 98 | 99 | def versions_from_vcs(tag_prefix, versionfile_source, verbose=False): 100 | # this runs 'git' from the root of the source tree. That either means 101 | # someone ran a setup.py command (and this code is in versioneer.py, so 102 | # IN_LONG_VERSION_PY=False, thus the containing directory is the root of 103 | # the source tree), or someone ran a project-specific entry point (and 104 | # this code is in _version.py, so IN_LONG_VERSION_PY=True, thus the 105 | # containing directory is somewhere deeper in the source tree). This only 106 | # gets called if the git-archive 'subst' variables were *not* expanded, 107 | # and _version.py hasn't already been rewritten with a short version 108 | # string, meaning we're inside a checked out source tree. 109 | 110 | try: 111 | here = os.path.abspath(__file__) 112 | except NameError: 113 | # some py2exe/bbfreeze/non-CPython implementations don't do __file__ 114 | return {} # not always correct 115 | 116 | # versionfile_source is the relative path from the top of the source tree 117 | # (where the .git directory might live) to this file. Invert this to find 118 | # the root from __file__. 
119 | root = here 120 | if IN_LONG_VERSION_PY: 121 | for i in range(len(versionfile_source.split("/"))): 122 | root = os.path.dirname(root) 123 | else: 124 | root = os.path.dirname(here) 125 | if not os.path.exists(os.path.join(root, ".git")): 126 | if verbose: 127 | print("no .git in %s" % root) 128 | return {} 129 | 130 | stdout = run_command([GIT, "describe", "--tags", "--always"], 131 | cwd=root) 132 | if stdout is None: 133 | return {} 134 | if not stdout.startswith(tag_prefix): 135 | if verbose: 136 | print("tag '%s' doesn't start with prefix '%s'" % (stdout, tag_prefix)) 137 | return {} 138 | tag = stdout[len(tag_prefix):] 139 | stdout = run_command([GIT, "rev-parse", "HEAD"], cwd=root) 140 | if stdout is None: 141 | return {} 142 | full = stdout.strip() 143 | return {"version": tag, "full": full} 144 | 145 | 146 | def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False): 147 | if IN_LONG_VERSION_PY: 148 | # We're running from _version.py. If it's from a source tree 149 | # (execute-in-place), we can work upwards to find the root of the 150 | # tree, and then check the parent directory for a version string. If 151 | # it's in an installed application, there's no hope. 152 | try: 153 | here = os.path.abspath(__file__) 154 | except NameError: 155 | # py2exe/bbfreeze/non-CPython don't have __file__ 156 | return {} # without __file__, we have no hope 157 | # versionfile_source is the relative path from the top of the source 158 | # tree to _version.py. Invert this to find the root from __file__. 159 | root = here 160 | for i in range(len(versionfile_source.split("/"))): 161 | root = os.path.dirname(root) 162 | else: 163 | # we're running from versioneer.py, which means we're running from 164 | # the setup.py in a source tree. sys.argv[0] is setup.py in the root. 
165 | here = os.path.abspath(sys.argv[0]) 166 | root = os.path.dirname(here) 167 | 168 | # Source tarballs conventionally unpack into a directory that includes 169 | # both the project name and a version string. 170 | dirname = os.path.basename(root) 171 | if not dirname.startswith(parentdir_prefix): 172 | if verbose: 173 | print("guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" % 174 | (root, dirname, parentdir_prefix)) 175 | return None 176 | return {"version": dirname[len(parentdir_prefix):], "full": ""} 177 | 178 | tag_prefix = "" 179 | parentdir_prefix = "textadapter-" 180 | versionfile_source = "textadapter/_version.py" 181 | 182 | def get_versions(default={"version": "unknown", "full": ""}, verbose=False): 183 | variables = { "refnames": git_refnames, "full": git_full } 184 | ver = versions_from_expanded_variables(variables, tag_prefix, verbose) 185 | if not ver: 186 | ver = versions_from_vcs(tag_prefix, versionfile_source, verbose) 187 | if not ver: 188 | ver = versions_from_parentdir(parentdir_prefix, versionfile_source, 189 | verbose) 190 | if not ver: 191 | ver = default 192 | return ver 193 | 194 | -------------------------------------------------------------------------------- /textadapter/core/IO.pyx: -------------------------------------------------------------------------------- 1 | 2 | cdef InputData* open_s3(object data): 3 | """ 4 | Set up read/seek functions for S3 data source 5 | """ 6 | cdef InputData *input_data = calloc(1, sizeof(InputData)) 7 | input_data.seek = &seek_s3 8 | input_data.read = &read_s3 9 | input_data.close = &close_s3 10 | input_data.input = data 11 | return input_data 12 | 13 | cdef void close_s3(InputData *input_data): 14 | """ 15 | Clean up InputData for S3 data source 16 | """ 17 | if input_data != NULL: 18 | free(input_data) 19 | 20 | cdef AdapterError seek_s3(InputData *input, uint64_t offset): 21 | """ 22 | Seek to offset in S3 data source 23 | 24 | Arguments: 25 | input - InputData struct 26 | 
offset - offset to seek to 27 | """ 28 | if (input == NULL): 29 | return ADAPTER_ERROR_SEEK; 30 | 31 | s3_input = input.input 32 | 33 | s3_key = s3_input['s3_key'] 34 | if offset > (s3_key.size - input.header): 35 | return ADAPTER_ERROR_SEEK_S3 36 | 37 | s3_input['offset'] = offset + input.header 38 | return ADAPTER_SUCCESS 39 | 40 | 41 | cdef AdapterError read_s3(InputData *input, char *buffer, uint64_t buffer_len, uint64_t *num_bytes_read): 42 | """ 43 | Read bytes from S3 data source and store in buffer. 44 | 45 | Arguments: 46 | input - text adapter struct 47 | buffer - output buffer for data read from S3 48 | buffer_len - length of buffer 49 | num_bytes_read - pointer to variable for storing number of bytes read from S3 50 | """ 51 | if (input == NULL): 52 | return ADAPTER_ERROR_SEEK; 53 | 54 | s3_input = input.input 55 | offset = s3_input['offset'] 56 | s3_key = s3_input['s3_key'] 57 | 58 | if offset >= s3_key.size: 59 | num_bytes_read[0] = 0 60 | return ADAPTER_ERROR_READ_EOF 61 | 62 | if offset < 0: 63 | return ADAPTER_ERROR_READ 64 | 65 | try: 66 | data = s3_key.get_contents_as_string(headers={'Range' : 'bytes={0}-{1}'.format(offset, offset+buffer_len)}) 67 | except: 68 | return ADAPTER_ERROR_READ_S3 69 | data_len = len(data) 70 | 71 | if data_len > buffer_len: 72 | data_len = buffer_len 73 | 74 | memcpy(buffer, data, data_len) 75 | num_bytes_read[0] = data_len 76 | 77 | s3_input['offset'] = s3_input['offset'] + data_len 78 | 79 | return ADAPTER_SUCCESS 80 | -------------------------------------------------------------------------------- /textadapter/core/TextAdapter.pxd: -------------------------------------------------------------------------------- 1 | cimport numpy 2 | 3 | cdef extern from '_stdint.h': 4 | # Actual type lengths are defined in _stdint.h 5 | # Sizes here are just place holders 6 | ctypedef unsigned long long uint64_t 7 | ctypedef unsigned int uint32_t 8 | ctypedef unsigned short uint16_t 9 | ctypedef unsigned char uint8_t 10 | ctypedef 
long long int64_t 11 | ctypedef int int32_t 12 | ctypedef short int16_t 13 | ctypedef char int8_t 14 | uint64_t UINT64_MAX 15 | 16 | cdef extern from 'string.h': 17 | void *memcpy(void *, void *, size_t) 18 | char *strncpy(char *, char *, size_t) 19 | void *memset(void *, int, size_t) 20 | 21 | cdef extern from "Python.h": 22 | ctypedef struct PyObject 23 | ctypedef struct FILE 24 | FILE* PyFile_AsFile(object) 25 | 26 | cdef extern from 'stdlib.h': 27 | void* calloc(size_t, size_t) 28 | void* malloc(size_t) 29 | void* realloc(void *, size_t) 30 | void free(void *) 31 | 32 | 33 | cdef extern from "../lib/khash.h": 34 | 35 | ctypedef uint32_t khint_t 36 | ctypedef khint_t khiter_t 37 | ctypedef char* kh_cstr_t 38 | 39 | ctypedef struct kh_string_t: 40 | khint_t n_buckets, size, n_occupied, upper_bound 41 | khint_t *flags 42 | kh_cstr_t *keys 43 | PyObject **vals 44 | 45 | kh_string_t* kh_init_string() 46 | void kh_destroy_string(kh_string_t*) 47 | khint_t kh_get_string(kh_string_t*, kh_cstr_t) 48 | khint_t kh_put_string(kh_string_t*, kh_cstr_t, int*) 49 | khint_t kh_str_hash_func(const char *s) 50 | khint_t kh_exist(kh_string_t*, khint_t) 51 | 52 | cdef extern from "numpy/arrayobject.h": 53 | object PyArray_NewFromDescr(object subtype, numpy.dtype descr, 54 | int nd, numpy.npy_intp* dims, 55 | numpy.npy_intp* strides, void* data, 56 | int flags, object obj) 57 | struct PyArray_Descr: 58 | int type_num, elsize 59 | char type 60 | 61 | cdef enum: 62 | NPY_MAXDIMS 63 | 64 | 65 | cdef extern from "zlib.h": 66 | int inflateEnd(void *) 67 | 68 | cdef extern from "io_functions.h": 69 | InputData* open_file(const char *filename) 70 | void close_file(InputData *input) 71 | AdapterError read_file(InputData *input, char *buffer, uint64_t len, 72 | uint64_t *num_bytes_read) 73 | AdapterError seek_file(InputData *input, uint64_t offset) 74 | 75 | InputData* open_memmap(char *data, size_t size) 76 | void close_memmap(InputData *input) 77 | AdapterError read_memmap(InputData 
*input, char *buffer, uint64_t len, 78 | uint64_t *num_bytes_read) 79 | AdapterError seek_memmap(InputData *input, uint64_t offset) 80 | 81 | AdapterError read_gzip(InputData *input, char *buffer, uint64_t len, 82 | uint64_t *num_bytes_read) 83 | AdapterError seek_gzip(InputData *input, uint64_t offset) 84 | 85 | void init_gzip(InputData *input) 86 | void close_gzip(InputData *input) 87 | 88 | cdef extern from 'converter_functions.h': 89 | 90 | ctypedef enum ConvertError: 91 | CONVERT_SUCCESS 92 | CONVERT_ERROR_UNKNOWN 93 | CONVERT_ERROR_OVERFLOW 94 | CONVERT_ERROR_INPUT_TYPE 95 | CONVERT_ERROR_INPUT_SIZE 96 | CONVERT_ERROR_OUTPUT_SIZE 97 | CONVERT_ERROR_INPUT_STRING 98 | CONVERT_ERROR_USER_CONVERTER 99 | CONVERT_ERROR_OBJECT_CONVERTER 100 | CONVERT_ERROR_NUMBA 101 | CONVERT_ERROR_LAST 102 | 103 | ctypedef ConvertError (*converter_func_ptr)(void *, uint32_t, int, 104 | void *, uint32_t, void *) 105 | 106 | ConvertError str2int_converter(void *input, uint32_t input_len, 107 | int input_type, void *output, uint32_t output_len, void *arg) 108 | ConvertError str2uint_converter(void *input, uint32_t input_len, 109 | int input_type, void *output, uint32_t output_len, void *arg) 110 | ConvertError str2float_converter(void *input, uint32_t input_len, 111 | int input_type, void *output, uint32_t output_len, void *arg) 112 | ConvertError str2str_converter(void *input, uint32_t input_len, 113 | int input_type, void *output, uint32_t output_len, void *arg) 114 | ConvertError str2complex_converter(void *input, uint32_t input_len, 115 | int input_type, void *output, uint32_t output_len, void *arg) 116 | 117 | cdef extern from 'index.h': 118 | enum: UNCOMPRESSED_WINDOW_SIZE 119 | enum: DEFAULT_INDEX_DENSITY 120 | enum: GZIP_ACCESS_POINT_DISTANCE 121 | 122 | ctypedef struct RecordOffset: 123 | uint64_t record_num 124 | uint64_t offset 125 | 126 | ctypedef struct GzipIndexAccessPoint: 127 | uint8_t bits 128 | uint64_t compressed_offset 129 | uint64_t uncompressed_offset 130 | 
unsigned char window[UNCOMPRESSED_WINDOW_SIZE] 131 | 132 | ctypedef void (*indexer_func_ptr)(void *index, uint64_t record_num, 133 | uint64_t record_offset) 134 | ctypedef RecordOffset (*index_lookup_func_ptr)(void *index, 135 | uint64_t record_num) 136 | 137 | ctypedef void (*add_gzip_access_point_func_ptr)(void *index, 138 | unsigned char *buffer, 139 | uint32_t compressed_offset, uint64_t uncompressed_offset, 140 | int avail_in, int avail_out, uint8_t data_type) 141 | 142 | ctypedef void (*get_gzip_access_point_func_ptr)(void *index, 143 | uint64_t offset, GzipIndexAccessPoint *point) 144 | 145 | cdef extern from 'field_info.h': 146 | ctypedef struct MissingValues: 147 | char **missing_values 148 | uint32_t *missing_value_lens 149 | uint32_t num_missing_values 150 | 151 | ctypedef struct FillValue: 152 | void *fill_value 153 | int loose 154 | 155 | ctypedef struct FieldInfo: 156 | char *name 157 | converter_func_ptr converter 158 | void *converter_arg 159 | MissingValues missing_values 160 | FillValue fill_value 161 | uint32_t input_field_width 162 | uint32_t output_field_size 163 | int infer_type 164 | 165 | ctypedef struct FieldList: 166 | uint32_t num_fields 167 | FieldInfo *field_info 168 | 169 | void set_num_fields(FieldList *fields, uint32_t num_fields) 170 | void init_missing_values(FieldList *fields, char *field_name, 171 | uint32_t field_num, uint32_t num_missing_values) 172 | void add_missing_value(FieldList *fields, char *field_name, 173 | uint32_t field_num, char *missing_value, uint32_t missing_value_len) 174 | void set_fill_value(FieldList *fields, char *field_name, 175 | uint32_t field_num, void *fill_value, uint32_t fill_value_len, int loose) 176 | uint32_t get_field_size(FieldList *fields, char *field_name, uint32_t field_num) 177 | 178 | ctypedef enum DefaultConverterFuncs: 179 | UINT_CONVERTER_FUNC 180 | INT_CONVERTER_FUNC 181 | FLOAT_CONVERTER_FUNC 182 | STRING_CONVERTER_FUNC 183 | STRING_OBJECT_CONVERTER_FUNC 184 | NUM_CONVERTER_FUNCS 185 | 
186 | void set_field_width(FieldList *fields, uint32_t field, uint32_t width) 187 | void reset_converters(FieldList *fields) 188 | void set_converter(FieldList *fields, uint32_t field_num, char *field_name, 189 | uint32_t output_field_size, converter_func_ptr converter, void *converter_arg) 190 | int infer_types(FieldList *fields) 191 | 192 | 193 | cdef extern from 'text_adapter.h': 194 | ctypedef enum AdapterError: 195 | ADAPTER_SUCCESS 196 | ADAPTER_ERROR_SEEK 197 | ADAPTER_ERROR_SEEK_EOF 198 | ADAPTER_ERROR_SEEK_S3 199 | ADAPTER_ERROR_READ 200 | ADAPTER_ERROR_READ_EOF 201 | ADAPTER_ERROR_READ_S3 202 | ADAPTER_ERROR_NO_FIELDS 203 | ADAPTER_ERROR_CONVERT 204 | ADAPTER_ERROR_INDEX 205 | ADAPTER_ERROR_PROCESS_TOKEN 206 | ADAPTER_ERROR_READ_TOKENS 207 | ADAPTER_ERROR_READ_RECORDS 208 | ADAPTER_ERROR_JSON 209 | ADAPTER_ERROR_INVALID_CHAR_CODE 210 | ADAPTER_ERROR_LAST 211 | 212 | 213 | ctypedef AdapterError (*read_func_ptr)(void *input, char *buffer, 214 | uint64_t len, uint64_t *num_bytes_read) 215 | ctypedef AdapterError (*seek_func_ptr)(void *input, uint64_t offset) 216 | ctypedef void (*close_func_ptr)(InputData *input) 217 | ctypedef AdapterError (*tokenize_func_ptr)(text_adapter_t *adapter, 218 | uint64_t num_tokens, uint64_t step, char **output, 219 | uint64_t *num_tokens_found, int enable_index, uint64_t index_density) 220 | 221 | ctypedef struct InputData: 222 | void *input 223 | read_func_ptr read 224 | seek_func_ptr seek 225 | close_func_ptr close 226 | void *compressed_input 227 | char *compressed_prebuffer 228 | read_func_ptr read_compressed 229 | seek_func_ptr seek_compressed 230 | get_gzip_access_point_func_ptr get_gzip_access_point 231 | uint64_t header 232 | uint64_t footer 233 | uint64_t start_record 234 | uint64_t start_offset 235 | void *index 236 | 237 | ctypedef struct MemMapInput: 238 | char *data 239 | uint64_t size 240 | uint64_t position 241 | 242 | ctypedef struct GzipInput: 243 | z_stream *z 244 | uint32_t compressed_bytes_processed 245 | 
uint64_t uncompressed_bytes_processed 246 | int buffer_refreshed 247 | void *uncompressed_input 248 | 249 | ctypedef struct JsonTokenizerArgs: 250 | JSON_checker_struct *jc 251 | 252 | ctypedef struct RegexTokenizerArgs: 253 | pcre *pcre_regex 254 | pcre_extra *extra_regex 255 | 256 | ctypedef struct ConvertErrorInfo: 257 | ConvertError convert_result 258 | char *token 259 | uint64_t record_num 260 | uint64_t field_num 261 | 262 | struct text_adapter_t: 263 | char delim_char 264 | char comment_char 265 | char quote_char 266 | char escape_char 267 | uint64_t num_records 268 | InputData *input_data 269 | tokenize_func_ptr tokenize 270 | void *tokenize_args 271 | uint64_t *field_widths 272 | void *index 273 | uint64_t index_density 274 | indexer_func_ptr indexer 275 | index_lookup_func_ptr index_lookup 276 | add_gzip_access_point_func_ptr add_gzip_access_point 277 | int infer_types_mode 278 | FieldList *fields 279 | int group_whitespace_delims 280 | int any_whitespace_as_delim 281 | int skipblanklines 282 | int reset_json_args 283 | 284 | AdapterError delim_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 285 | uint64_t step, char **output, uint64_t *num_tokens_found, 286 | int enable_index, uint64_t index_density) 287 | AdapterError json_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 288 | uint64_t step, char **output, uint64_t *num_tokens_found, 289 | int enable_index, uint64_t index_density) 290 | AdapterError json_record_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 291 | uint64_t step, char **output, uint64_t *num_tokens_found, 292 | int enable_index, uint64_t index_density) 293 | AdapterError regex_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 294 | uint64_t step, char **output, uint64_t *num_tokens_found, 295 | int enable_index, uint64_t index_density) 296 | AdapterError fixed_width_tokenizer(text_adapter_t *adapter, 297 | uint64_t num_tokens, uint64_t step, char **output, 298 | uint64_t *num_tokens_found, int enable_index, 
uint64_t index_density) 299 | AdapterError record_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 300 | uint64_t step, char **output, uint64_t *num_tokens_found, 301 | int enable_index, uint64_t index_density) 302 | AdapterError line_tokenizer(text_adapter_t *adapter, uint64_t num_tokens, 303 | uint64_t step, char **output, uint64_t *num_tokens_found, 304 | int enable_index, uint64_t index_density) 305 | 306 | AdapterError build_index(text_adapter_t *adapter) 307 | AdapterError build_gzip_index(text_adapter_t *adapter) 308 | 309 | text_adapter_t* open_text_adapter(InputData *input_data) 310 | 311 | void close_text_adapter(text_adapter_t *adapter) 312 | 313 | AdapterError seek_record(text_adapter_t *t, uint64_t rec_num) 314 | AdapterError seek_offset(text_adapter_t *t, uint64_t offset) 315 | AdapterError read_records(text_adapter_t *adapter, uint64_t num_records, 316 | uint64_t step, char *output, uint64_t *num_records_found) 317 | 318 | ConvertErrorInfo get_error_info() 319 | 320 | 321 | # NOTE: This is after "text_adapter.h" so that 322 | # PCRE_STATIC gets defined before including pcre.h. 323 | # This is necessary for the Windows build. 
324 | cdef extern from "pcre.h": 325 | struct pcre 326 | struct pcre_extra 327 | pcre* pcre_compile(char *, int, char **, int *, unsigned char *) 328 | pcre_extra* pcre_study(pcre *, int, char **) 329 | 330 | cdef extern from "zlib.h": 331 | ctypedef struct z_stream: 332 | pass 333 | 334 | cdef extern from "json_tokenizer.h": 335 | struct JSON_checker_struct 336 | JSON_checker_struct* new_JSON_checker(int depth) 337 | 338 | cdef extern converter_func_ptr default_converters[NUM_CONVERTER_FUNCS] 339 | -------------------------------------------------------------------------------- /textadapter/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContinuumIO/TextAdapter/53138c2277cdfcf32e127251313d4f77f81050aa/textadapter/core/__init__.py -------------------------------------------------------------------------------- /textadapter/core/index.h: -------------------------------------------------------------------------------- 1 | #ifndef INDEX_H 2 | #define INDEX_H 3 | 4 | #include <_stdint.h> 5 | 6 | 7 | /* buffer size of uncompressed gzip data */ 8 | #define UNCOMPRESSED_WINDOW_SIZE 32768 9 | 10 | /* Default index density value. 
Density value determines how many records 11 | to skip between each indexed record */ 12 | #define DEFAULT_INDEX_DENSITY 1000 13 | 14 | /* Default distance in bytes between gzip access points */ 15 | #define GZIP_ACCESS_POINT_DISTANCE 1024 * 1024 16 | 17 | 18 | typedef struct record_offset_t 19 | { 20 | uint64_t record_num; 21 | uint64_t offset; 22 | } RecordOffset; 23 | 24 | 25 | typedef struct gzip_index_access_point_t 26 | { 27 | uint8_t bits; 28 | uint64_t compressed_offset; 29 | uint64_t uncompressed_offset; 30 | unsigned char window[UNCOMPRESSED_WINDOW_SIZE]; 31 | } GzipIndexAccessPoint; 32 | 33 | 34 | /* indexer function pointer type */ 35 | typedef void (*indexer_func_ptr)(void *index, uint64_t record_num, 36 | uint64_t record_offset); 37 | 38 | typedef RecordOffset (*index_lookup_func_ptr)(void *index, uint64_t record_num); 39 | 40 | /* add gzip access point function pointer type */ 41 | typedef void (*add_gzip_access_point_func_ptr)(void *index, 42 | unsigned char *buffer, 43 | uint32_t compressed_offset, 44 | uint64_t uncompressed_offset, 45 | int avail_in, 46 | int avail_out, 47 | uint8_t data_type); 48 | 49 | typedef void (*get_gzip_access_point_func_ptr)(void *index, 50 | uint64_t offset, 51 | GzipIndexAccessPoint *point); 52 | 53 | void indexer_callback(void *index, uint64_t record_num, uint64_t record_offset); 54 | RecordOffset index_lookup_callback(void *index, uint64_t record_num); 55 | 56 | void add_gzip_access_point_callback(void *index, 57 | unsigned char *window, 58 | uint32_t compressed_offset, 59 | uint64_t uncompressed_offset, 60 | int avail_in, 61 | int avail_out, 62 | uint8_t bits); 63 | 64 | void get_gzip_access_point_callback(void *index, 65 | uint64_t offset, 66 | GzipIndexAccessPoint *point); 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /textadapter/core/io.h: -------------------------------------------------------------------------------- 1 | #include "text_adapter.h" 2 | 3 | 
AdapterError seek_s3(InputData *input, uint64_t offset); 4 | AdapterError read_s3(InputData *input, char *buffer, uint64_t buffer_len, 5 | uint64_t *num_bytes_read); 6 | -------------------------------------------------------------------------------- /textadapter/core/io_functions.h: -------------------------------------------------------------------------------- 1 | #ifndef IO_FUNCTIONS_H 2 | #define IO_FUNCTIONS_H 3 | 4 | #include "text_adapter.h" 5 | 6 | /* default file read/seek functions */ 7 | InputData* open_file(const char *filename); 8 | void close_file(InputData *input); 9 | AdapterError seek_file(InputData *input, uint64_t offset); 10 | AdapterError read_file(InputData *input, 11 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 12 | 13 | /* memmap read/seek functions */ 14 | InputData* open_memmap(char *data, size_t size); 15 | void close_memmap(InputData *input); 16 | AdapterError seek_memmap(InputData *input, uint64_t offset); 17 | AdapterError read_memmap(InputData *input, 18 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 19 | 20 | /* gzip read/seek functions */ 21 | AdapterError seek_gzip(InputData *input, uint64_t offset); 22 | AdapterError read_gzip(InputData *input, 23 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 24 | 25 | /* setup/teardown functions for gzip decompression data structures */ 26 | void init_gzip(InputData *input); 27 | void close_gzip(InputData *input); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /textadapter/core/json_tokenizer.c: -------------------------------------------------------------------------------- 1 | /* Adapted from json.org JSON_checker */ 2 | 3 | /* 2007-08-24 */ 4 | 5 | /* 6 | Copyright (c) 2005 JSON.org 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without 
limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | The Software shall be used for Good, not Evil. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | */ 28 | 29 | #include 30 | #include "json_tokenizer.h" 31 | 32 | #define true 1 33 | #define false 0 34 | 35 | /* 36 | Characters are mapped into these 31 character classes. This allows for 37 | a significant reduction in the size of the state transition table. 38 | */ 39 | 40 | int 41 | reject(JSON_checker jc) 42 | { 43 | /* 44 | Delete the JSON_checker object. 45 | */ 46 | free((void*)jc->stack); 47 | free((void*)jc); 48 | return false; 49 | } 50 | 51 | 52 | int 53 | push(JSON_checker jc, int mode) 54 | { 55 | /* 56 | Push a mode onto the stack. Return false if there is overflow. 57 | */ 58 | jc->top += 1; 59 | if (jc->top >= jc->depth) { 60 | return false; 61 | } 62 | jc->stack[jc->top] = mode; 63 | return true; 64 | } 65 | 66 | 67 | int 68 | pop(JSON_checker jc, int mode) 69 | { 70 | /* 71 | Pop the stack, assuring that the current mode matches the expectation. 72 | Return false if there is underflow or if the modes mismatch. 
73 | */ 74 | if (jc->top < 0 || jc->stack[jc->top] != mode) { 75 | return false; 76 | } 77 | jc->top -= 1; 78 | return true; 79 | } 80 | 81 | 82 | JSON_checker 83 | new_JSON_checker(int depth) 84 | { 85 | /* 86 | new_JSON_checker starts the checking process by constructing a JSON_checker 87 | object. It takes a depth parameter that restricts the level of maximum 88 | nesting. 89 | 90 | To continue the process, call JSON_checker_char for each character in the 91 | JSON text, and then call JSON_checker_done to obtain the final result. 92 | These functions are fully reentrant. 93 | 94 | The JSON_checker object will be deleted by JSON_checker_done. 95 | JSON_checker_char will delete the JSON_checker object if it sees an error. 96 | */ 97 | JSON_checker jc = (JSON_checker)malloc(sizeof(struct JSON_checker_struct)); 98 | jc->state = GO; 99 | jc->depth = depth; 100 | jc->top = -1; 101 | jc->stack = (int*)calloc(depth, sizeof(int)); 102 | push(jc, MODE_DONE); 103 | return jc; 104 | } 105 | 106 | 107 | int 108 | JSON_checker_char(JSON_checker jc, int next_char) 109 | { 110 | /* 111 | After calling new_JSON_checker, call this function for each character (or 112 | partial character) in your JSON text. It can accept UTF-8, UTF-16, or 113 | UTF-32. It returns true if things are looking ok so far. If it rejects the 114 | text, it deletes the JSON_checker object and returns false. 115 | */ 116 | int next_class, next_state; 117 | /* 118 | Determine the character's class. 119 | */ 120 | if (next_char < 0) { 121 | return reject(jc); 122 | } 123 | if (next_char >= 128) { 124 | next_class = C_ETC; 125 | } else { 126 | next_class = ascii_class[next_char]; 127 | if (next_class <= __) { 128 | return reject(jc); 129 | } 130 | } 131 | /* 132 | Get the next state from the state transition table. 133 | */ 134 | next_state = state_transition_table[jc->state][next_class]; 135 | if (next_state >= 0) { 136 | /* 137 | Change the state. 
138 | */ 139 | jc->state = next_state; 140 | } else { 141 | /* 142 | Or perform one of the actions. 143 | */ 144 | switch (next_state) { 145 | /* empty } */ 146 | case -33: 147 | jc->state = IN; 148 | break; 149 | case -9: 150 | if (!pop(jc, MODE_KEY)) { 151 | return reject(jc); 152 | } 153 | jc->state = OK; 154 | break; 155 | 156 | /* } */ case -8: 157 | if (!pop(jc, MODE_OBJECT)) { 158 | return reject(jc); 159 | } 160 | jc->state = OK; 161 | break; 162 | 163 | /* ] */ case -7: 164 | if (!pop(jc, MODE_ARRAY)) { 165 | return reject(jc); 166 | } 167 | jc->state = OK; 168 | break; 169 | 170 | /* { */ case -6: 171 | if (!push(jc, MODE_KEY)) { 172 | return reject(jc); 173 | } 174 | jc->state = OB; 175 | break; 176 | 177 | /* [ */ case -5: 178 | if (!push(jc, MODE_ARRAY)) { 179 | return reject(jc); 180 | } 181 | jc->state = AR; 182 | break; 183 | 184 | /* " */ case -4: 185 | switch (jc->stack[jc->top]) { 186 | case MODE_KEY: 187 | jc->state = CO; 188 | break; 189 | case MODE_ARRAY: 190 | case MODE_OBJECT: 191 | jc->state = OK; 192 | break; 193 | default: 194 | return reject(jc); 195 | } 196 | break; 197 | 198 | /* , */ case -3: 199 | switch (jc->stack[jc->top]) { 200 | case MODE_OBJECT: 201 | /* 202 | A comma causes a flip from object mode to key mode. 203 | */ 204 | if (!pop(jc, MODE_OBJECT) || !push(jc, MODE_KEY)) { 205 | return reject(jc); 206 | } 207 | jc->state = KE; 208 | break; 209 | case MODE_ARRAY: 210 | jc->state = VA; 211 | break; 212 | default: 213 | return reject(jc); 214 | } 215 | break; 216 | 217 | /* : */ case -2: 218 | /* 219 | A colon causes a flip from key mode to object mode. 220 | */ 221 | if (!pop(jc, MODE_KEY) || !push(jc, MODE_OBJECT)) { 222 | return reject(jc); 223 | } 224 | jc->state = VA; 225 | break; 226 | /* 227 | Bad action. 
228 | */ 229 | default: 230 | return reject(jc); 231 | } 232 | } 233 | return true; 234 | } 235 | 236 | 237 | int 238 | JSON_checker_done(JSON_checker jc) 239 | { 240 | /* 241 | The JSON_checker_done function should be called after all of the characters 242 | have been processed, but only if every call to JSON_checker_char returned 243 | true. This function deletes the JSON_checker and returns true if the JSON 244 | text was accepted. 245 | */ 246 | int result = (jc->state == OK || jc->state == NO) && pop(jc, MODE_DONE); 247 | reject(jc); 248 | return result; 249 | } 250 | -------------------------------------------------------------------------------- /textadapter/core/json_tokenizer.h: -------------------------------------------------------------------------------- 1 | /* Adapted from json.org JSON_checker */ 2 | 3 | #ifndef JSON_TOKENIZER_H 4 | #define JSON_TOKENIZER_H 5 | 6 | #define __ -1 /* the universal error code */ 7 | 8 | enum classes { 9 | C_SPACE, /* space */ 10 | C_NEWLINE, /* newline */ 11 | C_WHITE, /* other whitespace */ 12 | C_LCURB, /* { */ 13 | C_RCURB, /* } */ 14 | C_LSQRB, /* [ */ 15 | C_RSQRB, /* ] */ 16 | C_COLON, /* : */ 17 | C_COMMA, /* , */ 18 | C_QUOTE, /* " */ 19 | C_BACKS, /* \ */ 20 | C_SLASH, /* / */ 21 | C_PLUS, /* + */ 22 | C_MINUS, /* - */ 23 | C_POINT, /* . */ 24 | C_ZERO , /* 0 */ 25 | C_DIGIT, /* 123456789 */ 26 | C_LOW_A, /* a */ 27 | C_LOW_B, /* b */ 28 | C_LOW_C, /* c */ 29 | C_LOW_D, /* d */ 30 | C_LOW_E, /* e */ 31 | C_LOW_F, /* f */ 32 | C_LOW_L, /* l */ 33 | C_LOW_N, /* n */ 34 | C_LOW_R, /* r */ 35 | C_LOW_S, /* s */ 36 | C_LOW_T, /* t */ 37 | C_LOW_U, /* u */ 38 | C_ABCDF, /* ABCDF */ 39 | C_E, /* E */ 40 | C_ETC, /* everything else */ 41 | NR_CLASSES 42 | }; 43 | 44 | static int ascii_class[128] = { 45 | /* 46 | This array maps the 128 ASCII characters into character classes. 47 | The remaining Unicode characters should be mapped to C_ETC. 48 | Non-whitespace control characters are errors. 
49 | */ 50 | __, __, __, __, __, __, __, __, 51 | __, C_WHITE, C_NEWLINE, __, __, C_WHITE, __, __, 52 | __, __, __, __, __, __, __, __, 53 | __, __, __, __, __, __, __, __, 54 | 55 | C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 56 | C_ETC, C_ETC, C_ETC, C_PLUS, C_COMMA, C_MINUS, C_POINT, C_SLASH, 57 | C_ZERO, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, 58 | C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 59 | 60 | C_ETC, C_ABCDF, C_ABCDF, C_ABCDF, C_ABCDF, C_E, C_ABCDF, C_ETC, 61 | C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 62 | C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, 63 | C_ETC, C_ETC, C_ETC, C_LSQRB, C_BACKS, C_RSQRB, C_ETC, C_ETC, 64 | 65 | C_ETC, C_LOW_A, C_LOW_B, C_LOW_C, C_LOW_D, C_LOW_E, C_LOW_F, C_ETC, 66 | C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_L, C_ETC, C_LOW_N, C_ETC, 67 | C_ETC, C_ETC, C_LOW_R, C_LOW_S, C_LOW_T, C_LOW_U, C_ETC, C_ETC, 68 | C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC 69 | }; 70 | 71 | 72 | /* 73 | The state codes. 74 | */ 75 | enum states { 76 | GO, /* start */ 77 | OK, /* ok */ 78 | OB, /* object */ 79 | KE, /* key */ 80 | CO, /* colon */ 81 | VA, /* value */ 82 | AR, /* array */ 83 | ST, /* string */ 84 | ES, /* escape */ 85 | U1, /* u1 */ 86 | U2, /* u2 */ 87 | U3, /* u3 */ 88 | U4, /* u4 */ 89 | MI, /* minus */ 90 | ZE, /* zero */ 91 | IN, /* integer */ 92 | FR, /* fraction */ 93 | E1, /* e */ 94 | E2, /* ex */ 95 | E3, /* exp */ 96 | T1, /* tr */ 97 | T2, /* tru */ 98 | T3, /* true */ 99 | F1, /* fa */ 100 | F2, /* fal */ 101 | F3, /* fals */ 102 | F4, /* false */ 103 | N1, /* nu */ 104 | N2, /* nul */ 105 | N3, /* null */ 106 | NO, /* next object */ 107 | NR_STATES 108 | }; 109 | 110 | 111 | static int state_transition_table[NR_STATES][NR_CLASSES] = { 112 | /* 113 | The state transition table takes the current state and the current symbol, 114 | and returns either a new state or an action. An action is represented as a 115 | negative number. 
A JSON text is accepted if at the end of the text the 116 | state is OK and if the mode is MODE_DONE. 117 | 118 | newline white 1-9 ABCDF etc 119 | space | | { } [ ] : , " \ / + - . 0 | a b c d e f l n r s t u | E | */ 120 | /*start GO*/ { GO, __, GO, -6, __, -5, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 121 | /*ok OK*/ { OK, NO, OK, __, -8, __, -7, __, -3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 122 | /*object OB*/ { OB, __, OB, __, -9, __, __, __, __, ST, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 123 | /*key KE*/ { KE, __, KE, __, __, __, __, __, __, ST, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 124 | /*colon CO*/ { CO, __, CO, __, __, __, __, -2, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 125 | /*value VA*/ { VA, VA, VA, -6, __, -5, __, __, __,-11, __, __, __, MI, __,-16,-10, __, __, __, __, __,-14, __,-14, __, __,-14, __, __, __, __}, 126 | /*array AR*/ { AR, __, AR, -6, __, -5, -7, __, __,-11, __, __, __, MI, __,-16,-10, __, __, __, __, __,-14, __,-14, __, __,-14, __, __, __, __}, 127 | /*string ST*/ { ST, __, __, ST, ST, ST, ST, ST, ST, -4, ES, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST}, 128 | /*escape ES*/ { __, __, __, __, __, __, __, __, __, ST, ST, ST, __, __, __, __, __, __, ST, __, __, __, ST, __, ST, ST, __, ST, U1, __, __, __}, 129 | /*u1 U1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, U2, U2, U2, U2, U2, U2, U2, U2, __, __, __, __, __, __, U2, U2, __}, 130 | /*u2 U2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, U3, U3, U3, U3, U3, U3, U3, U3, __, __, __, __, __, __, U3, U3, __}, 131 | /*u3 U3*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, U4, U4, U4, U4, U4, U4, U4, U4, __, __, __, __, __, __, U4, 
U4, __}, 132 | /*u4 U4*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, ST, ST, ST, ST, ST, ST, ST, ST, __, __, __, __, __, __, ST, ST, __}, 133 | /*minus MI*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-16, IN, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 134 | /*zero ZE*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, FR, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 135 | /*int IN*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, FR, IN, IN, __, __, __, __, E1, __, __, __, __, __, __, __, __, E1, __}, 136 | /*frac FR*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, __, FR, FR, __, __, __, __, E1, __, __, __, __, __, __, __, __, E1, __}, 137 | /*e E1*/ { __, __, __, __, __, __, __, __, __, __, __, __, E2, E2, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 138 | /*ex E2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 139 | /*exp E3*/ { OK, __, OK, __, -8, __, -7, __, -3, __, __, __, __, __, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 140 | /*tr T1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, T2, __, __, __, __, __, __}, 141 | /*tru T2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, T3, __, __, __}, 142 | /*true T3*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-15, __, __, __, __, __, __, __, __, __, __}, 143 | /*fa F1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, F2, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 144 | /*fal F2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, F3, __, __, __, __, __, __, __, __}, 145 | /*fals F3*/ { __, __, __, __, __, __, __, __, __, __, __, __, 
__, __, __, __, __, __, __, __, __, __, __, __, __, __, F4, __, __, __, __, __}, 146 | /*false F4*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-15, __, __, __, __, __, __, __, __, __, __}, 147 | /*nu N1*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, N2, __, __, __}, 148 | /*nul N2*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, N3, __, __, __, __, __, __, __, __}, 149 | /*null N3*/ { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __,-15, __, __, __, __, __, __, __, __}, 150 | /*next NO*/ { __, NO, __, -6, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __}, 151 | }; 152 | 153 | 154 | /* 155 | These modes can be pushed on the stack. 156 | */ 157 | enum modes { 158 | MODE_ARRAY, 159 | MODE_DONE, 160 | MODE_KEY, 161 | MODE_OBJECT, 162 | }; 163 | 164 | typedef struct JSON_checker_struct { 165 | int state; 166 | int depth; 167 | int top; 168 | int* stack; 169 | } * JSON_checker; 170 | 171 | 172 | extern JSON_checker new_JSON_checker(int depth); 173 | extern int JSON_checker_char(JSON_checker jc, int next_char); 174 | extern int JSON_checker_done(JSON_checker jc); 175 | 176 | #endif 177 | -------------------------------------------------------------------------------- /textadapter/core/loadtxt.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import operator 3 | import textadapter 4 | 5 | from numpy.compat import ( 6 | asstr, bytes, basestring, unicode 7 | ) 8 | 9 | 10 | def loadtxt(fname, dtype=float, comments='#', delimiter=None, 11 | converters=None, skiprows=0, usecols=None, unpack=False, 12 | ndmin=0): 13 | """ 14 | Load data from a text file. 15 | 16 | Each row in the text file must have the same number of values. 
17 | 18 | Parameters 19 | ---------- 20 | fname : file or str 21 | File, filename, or generator to read. If the filename extension is 22 | ``.gz`` or ``.bz2``, the file is first decompressed. Note that 23 | generators should return byte strings for Python 3k. 24 | dtype : data-type, optional 25 | Data-type of the resulting array; default: float. If this is a 26 | record data-type, the resulting array will be 1-dimensional, and 27 | each row will be interpreted as an element of the array. In this 28 | case, the number of columns used must match the number of fields in 29 | the data-type. 30 | comments : str, optional 31 | The character used to indicate the start of a comment; 32 | default: '#'. 33 | delimiter : str, optional 34 | The string used to separate values. By default, this is any 35 | whitespace. 36 | converters : dict, optional 37 | A dictionary mapping column number to a function that will convert 38 | that column to a float. E.g., if column 0 is a date string: 39 | ``converters = {0: datestr2num}``. Converters can also be used to 40 | provide a default value for missing data (but see also `genfromtxt`): 41 | ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None. 42 | skiprows : int, optional 43 | Skip the first `skiprows` lines; default: 0. 44 | usecols : sequence, optional 45 | Which columns to read, with 0 being the first. For example, 46 | ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. 47 | The default, None, results in all columns being read. 48 | unpack : bool, optional 49 | If True, the returned array is transposed, so that arguments may be 50 | unpacked using ``x, y, z = loadtxt(...)``. When used with a record 51 | data-type, arrays are returned for each field. Default is False. 52 | ndmin : int, optional 53 | The returned array will have at least `ndmin` dimensions. 54 | Otherwise mono-dimensional axes will be squeezed. 55 | Legal values: 0 (default), 1 or 2. 56 | .. 
versionadded:: 1.6.0 57 | 58 | Returns 59 | ------- 60 | out : ndarray 61 | Data read from the text file. 62 | 63 | See Also 64 | -------- 65 | load, fromstring, fromregex 66 | genfromtxt : Load data with missing values handled as specified. 67 | scipy.io.loadmat : reads MATLAB data files 68 | 69 | Notes 70 | ----- 71 | This function aims to be a fast reader for simply formatted files. The 72 | `genfromtxt` function provides more sophisticated handling of, e.g., 73 | lines with missing values. 74 | 75 | Examples 76 | -------- 77 | >>> from StringIO import StringIO # StringIO behaves like a file object 78 | >>> c = StringIO("0 1\\n2 3") 79 | >>> np.loadtxt(c) 80 | array([[ 0., 1.], 81 | [ 2., 3.]]) 82 | 83 | >>> d = StringIO("M 21 72\\nF 35 58") 84 | >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'), 85 | ... 'formats': ('S1', 'i4', 'f4')}) 86 | array([('M', 21, 72.0), ('F', 35, 58.0)], 87 | dtype=[('gender', '|S1'), ('age', '>> c = StringIO("1,0,2\\n3,0,4") 90 | >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True) 91 | >>> x 92 | array([ 1., 3.]) 93 | >>> y 94 | array([ 2., 4.]) 95 | 96 | """ 97 | 98 | user_converters = converters 99 | 100 | whitespace_delims = False 101 | if delimiter is None: 102 | whitespace_delims = True 103 | 104 | compression = None 105 | if isinstance(fname, basestring) and fname[-3:] == '.gz': 106 | compression = 'gzip' 107 | 108 | try: 109 | adapter = textadapter.text_adapter(fname, parser='csv', delimiter=delimiter, 110 | comment=comments, header=skiprows, compression=compression, whitespace_delims=whitespace_delims, 111 | field_names=False, infer_types=False) 112 | except EOFError: 113 | array = numpy.array([], dtype=numpy.int64, ndmin=ndmin) 114 | if ndmin == 2: 115 | array = array.T 116 | return array 117 | 118 | if usecols is None: 119 | usecols = [x for x in range(0, adapter.field_count)] 120 | elif isinstance(usecols, numpy.ndarray): 121 | usecols = usecols.tolist() 122 | else: 123 | usecols = 
list(usecols) 124 | 125 | # create valid dtype object 126 | if isinstance(dtype, (list, tuple)): 127 | dtype = [dt if isinstance(dt, tuple) else ('', dt) for dt in dtype] 128 | dtype = numpy.dtype(dtype) 129 | 130 | # create list of dtypes to send to TextAdapter 131 | if dtype.names is None: 132 | # create list of homogenous scalar dtypes from single scalar dtype 133 | numFields = len(usecols) 134 | dtypes = [dtype]*numFields 135 | fieldNames = None 136 | else: 137 | # create list of scalar dtypes from struct dtype 138 | dtypes, fieldNames = unpack_dtype(dtype) 139 | 140 | if fieldNames is not None: 141 | list_names = ['' for x in range(adapter.field_count)] 142 | for i, col in enumerate(usecols): 143 | list_names[col] = fieldNames[i] 144 | adapter.field_names = list_names 145 | 146 | adapter.set_field_types(types=dict(zip(usecols, dtypes))) 147 | 148 | if converters is not None: 149 | for field, converter in converters.items(): 150 | adapter.set_converter(field, converter) 151 | 152 | array = adapter[usecols][:] 153 | 154 | if dtype.fields is not None and numpy.object_ not in [dt[0] for dt in array.dtype.fields.values()]: 155 | array.dtype = dtype 156 | elif dtype.fields is None: 157 | array.dtype = dtype 158 | if dtype.names is None: 159 | if adapter.field_count == 0: 160 | array.shape = (adapter.size,) 161 | else: 162 | array.shape = (adapter.size, len(usecols)) 163 | 164 | # Multicolumn data are returned with shape (1, N, M), i.e. 165 | # (1, 1, M) for a single row - remove the singleton dimension there 166 | if array.ndim == 3 and array.shape[:2] == (1, 1): 167 | array.shape = (1, -1) 168 | 169 | # Verify that the array has at least dimensions `ndmin`. 
170 | # Check correctness of the values of `ndmin` 171 | if not ndmin in [0, 1, 2]: 172 | raise ValueError('Illegal value of ndmin keyword: %s' % ndmin) 173 | 174 | # Tweak the size and shape of the arrays - remove extraneous dimensions 175 | if array.ndim > ndmin: 176 | array = numpy.squeeze(array) 177 | 178 | # and ensure we have the minimum number of dimensions asked for 179 | # - has to be in this order for the odd case ndmin=1, array.squeeze().ndim=0 180 | if array.ndim < ndmin: 181 | if ndmin == 1: 182 | array = numpy.atleast_1d(array) 183 | elif ndmin == 2: 184 | array = numpy.atleast_2d(array).T 185 | 186 | if unpack: 187 | if len(dtype) > 1: 188 | # For structured arrays, return an array for each field. 189 | return [array[field] for field in dtype.names] 190 | else: 191 | return array.T 192 | else: 193 | return array 194 | 195 | 196 | def unpack_dtype(dtype): 197 | dtypes = [] 198 | names = [] 199 | for name in dtype.names: 200 | if dtype.fields[name][0].names is None: 201 | count = 1 202 | shape = dtype.fields[name][0].shape 203 | if len(shape) > 0: 204 | count = 1 205 | for s in shape: 206 | count = count * s 207 | if count == 0 or count == 1: 208 | dtypes.append(dtype.fields[name][0].base) 209 | names.append(name) 210 | else: 211 | for x in range(count): 212 | dtypes.append(dtype.fields[name][0].base) 213 | names.append('') 214 | else: 215 | nested_dtypes, nested_names = unpack_dtype(dtype.fields[name][0]) 216 | for dt in nested_dtypes: 217 | dtypes.append(dt) 218 | for n in nested_names: 219 | names.append(n) 220 | return dtypes, names 221 | 222 | 223 | -------------------------------------------------------------------------------- /textadapter/core/text_adapter.h: -------------------------------------------------------------------------------- 1 | #ifndef TEXTADAPTER_H 2 | #define TEXTADAPTER_H 3 | 4 | #ifdef _WIN32 5 | #define PCRE_STATIC 6 | #endif 7 | 8 | #include 9 | #include 10 | #include 11 | #include "converter_functions.h" 12 | #include 
"index.h" 13 | #include "field_info.h" 14 | #include "json_tokenizer.h" 15 | 16 | 17 | /* Buffer size for reading in compressed gzip data before uncompressing */ 18 | #define COMPRESSED_BUFFER_SIZE 1024*1024 19 | 20 | 21 | /* TextAdapter error codes */ 22 | typedef enum 23 | { 24 | ADAPTER_SUCCESS, 25 | ADAPTER_ERROR_SEEK, 26 | ADAPTER_ERROR_SEEK_EOF, 27 | ADAPTER_ERROR_SEEK_GZIP, 28 | ADAPTER_ERROR_SEEK_S3, 29 | ADAPTER_ERROR_READ, 30 | ADAPTER_ERROR_READ_EOF, 31 | ADAPTER_ERROR_READ_GZIP, 32 | ADAPTER_ERROR_READ_S3, 33 | ADAPTER_ERROR_NO_FIELDS, 34 | ADAPTER_ERROR_CONVERT, 35 | ADAPTER_ERROR_INDEX, 36 | ADAPTER_ERROR_PROCESS_TOKEN, 37 | ADAPTER_ERROR_READ_TOKENS, 38 | ADAPTER_ERROR_READ_RECORDS, 39 | ADAPTER_ERROR_JSON, 40 | ADAPTER_ERROR_INVALID_CHAR_CODE, 41 | ADAPTER_ERROR_LAST 42 | } AdapterError; 43 | 44 | 45 | typedef enum tokenizer_state 46 | { 47 | DEFAULT_STATE, 48 | RECORD_STATE, 49 | RECORD_END_STATE, 50 | COMMENT_STATE, 51 | QUOTE_STATE, 52 | QUOTE_END_STATE, 53 | PROCESS_STATE, 54 | ESCAPE_STATE 55 | } TokenizerState; 56 | 57 | 58 | typedef struct text_adapter_t TextAdapter; 59 | typedef struct input_data_t InputData; 60 | 61 | 62 | /* read function type for reading blocks of text from data source */ 63 | typedef AdapterError (*read_func_ptr)(InputData *input, 64 | char *buffer, uint64_t len, uint64_t *num_bytes_read); 65 | 66 | /* seek function type for seeking to position in data source */ 67 | typedef AdapterError (*seek_func_ptr)(InputData *input, 68 | uint64_t offset); 69 | 70 | /* cleans up any handles or pointers involved in reading from data source */ 71 | typedef void (*close_func_ptr)(InputData *input); 72 | 73 | /* tokenize function for parsing text buffer and finding fields appropriate 74 | converter function should be called for each field that is found */ 75 | typedef AdapterError (*tokenize_func_ptr)(TextAdapter *adapter, 76 | uint64_t num_tokens, uint64_t step, char **output, 77 | uint64_t *num_tokens_found, int enable_index, uint64_t 
index_density); 78 | 79 | 80 | struct input_data_t 81 | { 82 | void *input; 83 | 84 | /* retrieves data chunks from data source and stores in buffer */ 85 | read_func_ptr read; 86 | 87 | /* seeks to new position in data source */ 88 | seek_func_ptr seek; 89 | 90 | /* cleans up any handles or pointers involved in reading from data source */ 91 | close_func_ptr close; 92 | 93 | void *compressed_input; 94 | 95 | char *compressed_prebuffer; 96 | 97 | /* retrieves and decompresses data chunks from compressed data source 98 | and stores in buffer */ 99 | read_func_ptr read_compressed; 100 | 101 | /* seeks to new position in compressed data source */ 102 | seek_func_ptr seek_compressed; 103 | 104 | /* Retrieve gzip access point from index */ 105 | get_gzip_access_point_func_ptr get_gzip_access_point; 106 | 107 | /* number of bytes to skip at beginning of data stream */ 108 | uint64_t header; 109 | 110 | /* number of bytes to skip at end of data stream */ 111 | uint64_t footer; 112 | 113 | /* Record where reading is started from after seek */ 114 | uint64_t start_record; 115 | 116 | /* Data offset where reading is started from after seek */ 117 | uint64_t start_offset; 118 | 119 | /* index of record offsets */ 120 | void *index; 121 | 122 | }; 123 | 124 | 125 | typedef struct memmap_input_t 126 | { 127 | char *data; 128 | uint64_t size; 129 | uint64_t position; 130 | } MemMapInput; 131 | 132 | 133 | typedef struct gzip_input_t 134 | { 135 | /* data struct for reading gzipped compressed data */ 136 | z_stream *z; 137 | 138 | uint32_t compressed_bytes_processed; 139 | uint64_t uncompressed_bytes_processed; 140 | int buffer_refreshed; 141 | 142 | /* data struct for reading uncompressed data */ 143 | void *uncompressed_input; 144 | } GzipInput; 145 | 146 | 147 | typedef struct json_tokenizer_args_t 148 | { 149 | struct JSON_checker_struct *jc; 150 | } JsonTokenizerArgs; 151 | 152 | typedef struct regex_tokenizer_args_t 153 | { 154 | pcre *pcre_regex; 155 | struct pcre_extra
*extra_regex; 156 | } RegexTokenizerArgs; 157 | 158 | 159 | typedef struct text_adapter_buffer_t 160 | { 161 | char *data; 162 | uint64_t size; 163 | uint64_t bytes_processed; 164 | int eof; 165 | } TextAdapterBuffer; 166 | 167 | 168 | typedef struct convert_error_info_t 169 | { 170 | ConvertError convert_result; 171 | char *token; 172 | uint64_t record_num; 173 | uint64_t field_num; 174 | } ConvertErrorInfo; 175 | 176 | 177 | typedef struct text_adapter_t 178 | { 179 | uint64_t num_records; 180 | 181 | char delim_char; 182 | char comment_char; 183 | char quote_char; 184 | char escape_char; 185 | 186 | /* Setting this to true will treat a series of whitespace 187 | as a single delimiter. Otherwise, each whitespace char 188 | will delimit a single field. */ 189 | int group_whitespace_delims; 190 | int any_whitespace_as_delim; 191 | 192 | int infer_types_mode; 193 | 194 | /* If 0, empty lines will be treated as missing fields. Defaults to 1. */ 195 | int skipblanklines; 196 | 197 | InputData *input_data; 198 | 199 | /* array of field info for each field */ 200 | FieldList *fields; 201 | 202 | /* buffer for storing chunks of data from data source to be parsed */ 203 | TextAdapterBuffer buffer; 204 | 205 | /* parses tokens in buffer */ 206 | tokenize_func_ptr tokenize; 207 | void *tokenize_args; 208 | 209 | /* index of record offsets */ 210 | void *index; 211 | 212 | /* Density of record offsets index. Density value x means every 213 | x-th record is indexed.
*/ 214 | uint64_t index_density; 215 | 216 | /* function for building additional index info for specific 217 | data stream type */ 218 | indexer_func_ptr indexer; 219 | index_lookup_func_ptr index_lookup; 220 | add_gzip_access_point_func_ptr add_gzip_access_point; 221 | 222 | int reset_json_args; 223 | 224 | } TextAdapter; 225 | 226 | 227 | /* Allocate new TextAdapter struct and set functions */ 228 | TextAdapter* open_text_adapter(InputData *input_data); 229 | 230 | /* Deallocate TextAdapter struct */ 231 | void close_text_adapter(TextAdapter *adapter); 232 | 233 | /* Seek to specific record in data source */ 234 | AdapterError seek_record(TextAdapter *adapter, uint64_t rec_num); 235 | 236 | /* Read specified number of records from data source, starting at current 237 | position. Fields in records will be converted to data type and stored in 238 | output buffer. Output buffer should be big enough to store 239 | requested records. */ 240 | AdapterError read_records(TextAdapter *adapter, uint64_t num_records, 241 | uint64_t step, char *output, uint64_t *num_records_found); 242 | 243 | /* default build index function */ 244 | AdapterError build_index(TextAdapter *adapter); 245 | 246 | /* initialize default index info */ 247 | void clear_gzip_index(TextAdapter *adapter); 248 | 249 | /* build index function for gzip files */ 250 | AdapterError build_gzip_index(TextAdapter *adapter); 251 | 252 | /* default tokenize function based on delimiter */ 253 | AdapterError delim_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 254 | uint64_t step, char **output, uint64_t *num_tokens_found, 255 | int enable_index, uint64_t index_density); 256 | 257 | AdapterError json_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 258 | uint64_t step, char **output, uint64_t *num_tokens_found, 259 | int enable_index, uint64_t index_density); 260 | 261 | AdapterError json_record_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 262 | uint64_t step, char **output, uint64_t *num_tokens_found,
263 | int enable_index, uint64_t index_density); 264 | 265 | /* regular expression tokenize function */ 266 | AdapterError regex_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 267 | uint64_t step, char **output, uint64_t *num_tokens_found, 268 | int enable_index, uint64_t index_density); 269 | 270 | /* tokenize function based on predefined field widths */ 271 | AdapterError fixed_width_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 272 | uint64_t step, char **output, uint64_t *num_tokens_found, 273 | int enable_index, uint64_t index_density); 274 | 275 | AdapterError record_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 276 | uint64_t step, char **output, uint64_t *num_tokens_found, 277 | int enable_index, uint64_t index_density); 278 | 279 | AdapterError line_tokenizer(TextAdapter *adapter, uint64_t num_tokens, 280 | uint64_t step, char **output, uint64_t *num_tokens_found, 281 | int enable_index, uint64_t index_density); 282 | 283 | ConvertErrorInfo get_error_info(void); 284 | 285 | 286 | #endif 287 | -------------------------------------------------------------------------------- /textadapter/examples/README: -------------------------------------------------------------------------------- 1 | To run examples, first generate example data using: 2 | 3 | cd ../tests 4 | python generate.py 500 # number of records 5 | -------------------------------------------------------------------------------- /textadapter/examples/basic.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/ints', delimiter=',', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u8', 2:'f4', 3:'f8', 4:'S10'}) 7 | 8 | # Read all records 9 | print(adapter[:]) 10 | 11 | # Read first ten records 12 | print(adapter[0:10]) 13 | 14 | # Change dtype; retrieve only 1st and 5th field 15 | adapter.set_field_types({0:'u4', 
4:'u4'}) 16 | 17 | # Read every other record 18 | print(adapter[::2]) 19 | 20 | -------------------------------------------------------------------------------- /textadapter/examples/converter.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/ints', delimiter=',', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u8', 2:'f4', 3:'f8', 4:'S10'}) 7 | 8 | # Override default converter for first field 9 | adapter.set_converter(0, lambda x: int(x)*2) 10 | 11 | # Read first 10 records 12 | print(adapter[:10]) 13 | 14 | -------------------------------------------------------------------------------- /textadapter/examples/fixed_width.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.FixedWidthTextAdapter('../tests/data/fixedwidths', [2,3,4,5,6]) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types(dict(zip(range(5), ['u4']*5))) 7 | 8 | # Read all records 9 | print(adapter[:]) 10 | 11 | 12 | -------------------------------------------------------------------------------- /textadapter/examples/gzip_ints.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/ints.gz', delimiter=',', compression='gzip', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u8', 2:'f4', 3:'f8', 4:'S10'}) 7 | 8 | print('\n\n!!! INVESTIGATE !!!\n\n') 9 | 10 | # adapter.size is unknown at this point... 
11 | print('Before we read any records, try adapter.size...') 12 | try: 13 | sz = adapter.size 14 | except AttributeError as err: 15 | print('AttributeError:', err) 16 | 17 | # Read first record 18 | print('Read first record\n', adapter[0]) 19 | 20 | # But now adapter.size IS known! 21 | print('After we read a record...') 22 | print('adapter.size', adapter.size) 23 | 24 | # Read last record 25 | print('\n\nNow we attempt to read the LAST record...') 26 | print('adapter[-1] == adapter[0]?!? == ', adapter[-1]) 27 | print('adapter[1], as should be == ', adapter[1]) 28 | 29 | print('After we read ALL records...') 30 | records = adapter[:] 31 | print('adapter[-1] == ', adapter[-1]) 32 | 33 | print('\n\nFollowing code seems outdated. Remove it?') 34 | try: 35 | # build index of records and save index to file 36 | indexArray, gzipIndexArray = adapter.create_index() 37 | # load index from file 38 | adapter.set_index(indexArray, gzipIndexArray) 39 | except TypeError as err: 40 | raise TypeError(err) 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /textadapter/examples/missing_values.py: -------------------------------------------------------------------------------- 1 | import textadapter 2 | 3 | adapter = textadapter.CSVTextAdapter('../tests/data/missingvalues', delimiter=',', field_names=False) 4 | 5 | # Set dtype for each field in record 6 | adapter.set_field_types({0:'u4', 1:'u4', 2:'u4', 3:'u4', 4:'u4'}) 7 | 8 | # Define list of strings for each field that represent missing values 9 | adapter.set_missing_values({0:['NA', 'NaN'], 4:['xx','inf']}) 10 | 11 | # Set fill value for missing values in each field 12 | adapter.set_fill_values({0:99, 4:999}) 13 | 14 | # Read all records 15 | print(adapter[:]) 16 | 17 | 18 | -------------------------------------------------------------------------------- /textadapter/examples/regex.py: -------------------------------------------------------------------------------- 1 | import 
textadapter 2 | 3 | adapter = textadapter.RegexTextAdapter('../tests/data/ints', '([0-9]*),([0-9]*),([0-9]*),([0-9]*),([0-9]*)') 4 | 5 | # Set dtype for each group in regular expression. 6 | # Any groups without a dtype defined for it will not be 7 | # stored in numpy array 8 | adapter.set_field_types(dict(zip(range(5), ['u4']*5))) 9 | 10 | # Read all records 11 | print(adapter[:]) 12 | 13 | -------------------------------------------------------------------------------- /textadapter/lib/Converters.pyx: -------------------------------------------------------------------------------- 1 | import numpy 2 | cimport numpy 3 | 4 | 5 | cdef ConvertError str2str_object_converter(void *input_str, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 6 | """ 7 | Wrapper function for calling string object converter function 8 | from low level C api. This is used to convert c strings to python 9 | string objects. 10 | 11 | Arguments: 12 | void *input - pointer to value to convert 13 | uint32_t input_len - length in bytes of input value 14 | void *output - pointer to memory chunk to store converted value 15 | uint32_t output_len - size of output memory chunk 16 | void *arg - pointer to python callable object which does the actual converting 17 | 18 | Returns: 19 | converted value as a python string object 20 | """ 21 | cdef ConvertError result = CONVERT_ERROR_OBJECT_CONVERTER 22 | cdef PyObject **object_ptr 23 | object_ptr = output 24 | cdef kh_string_t *kh_string_table = arg 25 | cdef int ret 26 | cdef khiter_t it 27 | cdef object temp 28 | cdef char *input_str_copy 29 | 30 | try: 31 | # Convert c string to Python string object and store in output array 32 | if object_ptr != NULL: 33 | 34 | # string object hash table exists 35 | if kh_string_table != NULL: 36 | 37 | # Look for existing string object 38 | it = kh_get_string(kh_string_table, input_str) 39 | 40 | # String object doesn't exist, so create and store in output 41 | # array and hash table 
42 | if it == kh_string_table.n_buckets: 43 | temp = (input_str)[0:input_len].decode(config['encoding']) 44 | object_ptr[0] = temp 45 | Py_INCREF(object_ptr[0]) 46 | input_str_copy = malloc(input_len+1) 47 | strncpy(input_str_copy, input_str, input_len+1) 48 | it = kh_put_string(kh_string_table, input_str_copy, &ret) 49 | kh_string_table.vals[it] = object_ptr[0] 50 | 51 | # String object exists, so store existing object in array 52 | else: 53 | object_ptr[0] = kh_string_table.vals[it] 54 | Py_INCREF(object_ptr[0]) 55 | 56 | # No string object hash table exists; just convert and store 57 | else: 58 | temp = (input_str)[0:input_len].decode(config['encoding']) 59 | object_ptr[0] = temp 60 | Py_INCREF(object_ptr[0]) 61 | 62 | # Try converting c string to Python string object (for type inference) 63 | else: 64 | temp = (input_str)[0:input_len].decode(config['encoding']) 65 | 66 | result = CONVERT_SUCCESS 67 | 68 | except Exception as e: 69 | result = CONVERT_ERROR_OBJECT_CONVERTER 70 | 71 | return result 72 | 73 | 74 | cdef ConvertError str2datetime_object_converter(void *input_str, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 75 | """ 76 | Wrapper function for calling string object converter function 77 | from low level C api. This is used to convert c strings to python 78 | string objects. 
79 | 80 | Arguments: 81 | void *input - pointer to value to convert 82 | uint32_t input_len - length in bytes of input value 83 | void *output - pointer to memory chunk to store converted value 84 | uint32_t output_len - size of output memory chunk 85 | void *arg - pointer to python callable object which does the actual converting 86 | 87 | Returns: 88 | converted value as a python string object 89 | """ 90 | cdef ConvertError result = CONVERT_ERROR_OBJECT_CONVERTER 91 | cdef PyObject **object_ptr 92 | object_ptr = output 93 | cdef object temp 94 | 95 | try: 96 | if object_ptr != NULL: 97 | temp = str((input_str)[0:input_len].encode()) 98 | object_ptr[0] = temp 99 | Py_INCREF(object_ptr[0]) 100 | 101 | result = CONVERT_SUCCESS 102 | except Exception as e: 103 | result = CONVERT_ERROR_OBJECT_CONVERTER 104 | 105 | return result 106 | 107 | 108 | cdef ConvertError str2datetime_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 109 | """ 110 | Wrapper function for calling numpy datetime converter function 111 | from low level C api. 
112 | 113 | Arguments: 114 | void *input - pointer to value to convert 115 | uint32_t input_len - length in bytes of input value 116 | void *output - pointer to memory chunk to store converted value 117 | uint32_t output_len - size of output memory chunk 118 | void *arg - pointer to python callable object which does the actual converting 119 | 120 | Returns: 121 | Convert result 122 | """ 123 | cdef ConvertError result = CONVERT_ERROR_OBJECT_CONVERTER 124 | cdef numpy.npy_intp dims[1] 125 | cdef char *temp = input 126 | 127 | if arg == NULL: 128 | return CONVERT_ERROR_OBJECT_CONVERTER 129 | 130 | try: 131 | dtype = arg 132 | value = dtype.type(temp) 133 | if output != NULL: 134 | dims[0] = 1 135 | array = numpy.PyArray_SimpleNewFromData(1, dims, value.dtype.num, output) 136 | array.dtype = numpy.dtype(dtype) 137 | array[0] = value 138 | result = CONVERT_SUCCESS 139 | except Exception as e: 140 | result = CONVERT_ERROR_OBJECT_CONVERTER 141 | 142 | return result 143 | 144 | 145 | cdef ConvertError python_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 146 | """ 147 | Wrapper function for calling python converter function from low level C api. 148 | 149 | Arguments: 150 | void *input - pointer to value to convert 151 | uint32_t input_len - length in bytes of input value 152 | void *output - pointer to memory chunk to store converted value 153 | uint32_t output_len - size of output memory chunk 154 | void *arg - pointer to python callable object which does the actual converting 155 | 156 | Returns: 157 | Convert result 158 | """ 159 | cdef numpy.npy_intp dims[1] 160 | cdef char *temp = calloc(1, input_len+1) 161 | cdef bytes py_string 162 | cdef ConvertError result = CONVERT_ERROR_USER_CONVERTER 163 | # "input" contains a long string (char*). 
We only copy "input_len" and make 164 | # sure that there is a null byte at the end (by using calloc with 165 | # input_len+1 above) 166 | memcpy(temp, input, input_len) 167 | 168 | try: 169 | # Convert "temp" to a Python string (bytes in fact) 170 | py_string = temp 171 | # Convert "arg" to Python callable: 172 | func = arg 173 | # call python callable object to convert input value 174 | new_value = func(py_string) 175 | 176 | if isinstance(new_value, numpy.generic): 177 | data = bytes(new_value.data) 178 | if output != NULL: 179 | memcpy(output, data, output_len) 180 | result = CONVERT_SUCCESS 181 | # JNB: not sure if there is a better way to store objects in numpy object array 182 | elif isinstance(new_value, object): 183 | if output != NULL: 184 | dims[0] = 1 185 | array = numpy.PyArray_SimpleNewFromData(1, dims, numpy.NPY_OBJECT, output) 186 | array[0] = new_value 187 | result = CONVERT_SUCCESS 188 | else: 189 | result = CONVERT_ERROR_USER_CONVERTER 190 | 191 | except: 192 | result = CONVERT_ERROR_USER_CONVERTER 193 | finally: 194 | free(temp) 195 | 196 | return result 197 | 198 | 199 | ctypedef uint64_t (*uint_numba_func_ptr)(char *) 200 | ctypedef int64_t (*int_numba_func_ptr)(char *) 201 | ctypedef double (*float_numba_func_ptr)(char *) 202 | ctypedef PyObject* (*object_numba_func_ptr)(char *) 203 | ctypedef int64_t (*datetime_numba_func_ptr)(char *) 204 | 205 | cdef ConvertError str2uint_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 206 | cdef uint_numba_func_ptr numba_func = arg 207 | cdef uint64_t *output_ptr64 = output 208 | cdef uint32_t *output_ptr32 = output 209 | cdef uint16_t *output_ptr16 = output 210 | cdef uint8_t *output_ptr8 = output 211 | cdef uint64_t value 212 | 213 | try: 214 | if output_len == 8: 215 | value = numba_func(input) 216 | if output != NULL: 217 | output_ptr64[0] = value 218 | elif output_len == 4: 219 | value = numba_func(input) 220 | if value > 0xffffffff: 221 
| return CONVERT_ERROR_NUMBA 222 | if output != NULL: 223 | output_ptr32[0] = value 224 | elif output_len == 2: 225 | value = numba_func(input) 226 | if value > 0xffff: 227 | return CONVERT_ERROR_NUMBA 228 | if output != NULL: 229 | output_ptr16[0] = value 230 | elif output_len == 1: 231 | value = numba_func(input) 232 | if value > 0xff: 233 | return CONVERT_ERROR_NUMBA 234 | if output != NULL: 235 | output_ptr8[0] = value 236 | else: 237 | return CONVERT_ERROR_NUMBA 238 | except: 239 | return CONVERT_ERROR_NUMBA 240 | return CONVERT_SUCCESS 241 | 242 | cdef ConvertError str2float_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 243 | cdef float_numba_func_ptr numba_func = arg 244 | cdef float *output_ptr32 = output 245 | cdef double *output_ptr64 = output 246 | cdef double value 247 | 248 | try: 249 | if output_len == 4: 250 | value = numba_func(input) 251 | if output != NULL: 252 | output_ptr32[0] = value 253 | elif output_len == 8: 254 | value = numba_func(input) 255 | if output != NULL: 256 | output_ptr64[0] = value 257 | else: 258 | return CONVERT_ERROR_NUMBA 259 | except: 260 | return CONVERT_ERROR_NUMBA 261 | return CONVERT_SUCCESS 262 | 263 | cdef ConvertError str2datetime_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 264 | cdef datetime_numba_func_ptr numba_func = arg 265 | cdef int64_t *output_ptr = output 266 | cdef int64_t value 267 | 268 | try: 269 | if output_len == 8: 270 | value = numba_func(input) 271 | if output != NULL: 272 | output_ptr[0] = value 273 | else: 274 | return CONVERT_ERROR_NUMBA 275 | except: 276 | return CONVERT_ERROR_NUMBA 277 | return CONVERT_SUCCESS 278 | 279 | cdef ConvertError str2object_numba_converter(void *input, uint32_t input_len, int input_type, void *output, uint32_t output_len, void *arg): 280 | cdef object_numba_func_ptr numba_func = arg 281 | cdef PyObject **output_ptr = output 282 | cdef 
object value 283 | 284 | try: 285 | value = numba_func(input) 286 | if output != NULL: 287 | output_ptr[0] = value 288 | except: 289 | return CONVERT_ERROR_NUMBA 290 | 291 | return CONVERT_SUCCESS 292 | 293 | 294 | 295 | -------------------------------------------------------------------------------- /textadapter/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContinuumIO/TextAdapter/53138c2277cdfcf32e127251313d4f77f81050aa/textadapter/lib/__init__.py -------------------------------------------------------------------------------- /textadapter/lib/_stdint.h: -------------------------------------------------------------------------------- 1 | #ifndef STDINT_H 2 | #define STDINT_H 3 | 4 | 5 | #if defined(_MSC_VER) && _MSC_VER < 1600 6 | /* Visual Studio before 2010 didn't have stdint.h */ 7 | #include 8 | typedef signed char int8_t; 9 | typedef short int16_t; 10 | typedef int int32_t; 11 | typedef __int64 int64_t; 12 | typedef unsigned char uint8_t; 13 | typedef unsigned short uint16_t; 14 | typedef unsigned int uint32_t; 15 | typedef unsigned __int64 uint64_t; 16 | #define INT8_MIN SCHAR_MIN 17 | #define INT8_MAX SCHAR_MAX 18 | #define INT16_MIN SHRT_MIN 19 | #define INT16_MAX SHRT_MAX 20 | #define INT32_MIN INT_MIN 21 | #define INT32_MAX INT_MAX 22 | #define UINT8_MAX UCHAR_MAX 23 | #define UINT16_MAX USHRT_MAX 24 | #define UINT32_MAX UINT_MAX 25 | #define UINT64_MAX _UI64_MAX 26 | #else 27 | #include 28 | #endif 29 | 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /textadapter/lib/converter_functions.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERTERS_H 2 | #define CONVERTERS_H 3 | 4 | #if defined(_MSC_VER) && _MSC_VER < 1600 5 | /* Visual Studio before 2010 didn't have stdint.h */ 6 | typedef signed char int8_t; 7 | typedef short int16_t; 8 | typedef int int32_t; 9 | 
typedef __int64 int64_t; 10 | typedef unsigned char uint8_t; 11 | typedef unsigned short uint16_t; 12 | typedef unsigned int uint32_t; 13 | typedef unsigned __int64 uint64_t; 14 | #define INT8_MIN SCHAR_MIN 15 | #define INT8_MAX SCHAR_MAX 16 | #define INT16_MIN SHRT_MIN 17 | #define INT16_MAX SHRT_MAX 18 | #define INT32_MIN INT_MIN 19 | #define INT32_MAX INT_MAX 20 | #define UINT8_MAX UCHAR_MAX 21 | #define UINT16_MAX USHRT_MAX 22 | #define UINT32_MAX UINT_MAX 23 | #else 24 | #include 25 | #endif 26 | 27 | #include 28 | 29 | 30 | typedef enum 31 | { 32 | CONVERT_SUCCESS, 33 | CONVERT_SUCCESS_TYPE_CHANGED, 34 | CONVERT_ERROR, 35 | CONVERT_ERROR_OVERFLOW, 36 | CONVERT_ERROR_TRUNCATE, 37 | CONVERT_ERROR_INPUT_TYPE, 38 | CONVERT_ERROR_INPUT_SIZE, 39 | CONVERT_ERROR_OUTPUT_SIZE, 40 | CONVERT_ERROR_INPUT_STRING, 41 | CONVERT_ERROR_USER_CONVERTER, 42 | CONVERT_ERROR_OBJECT_CONVERTER, 43 | CONVERT_ERROR_NUMBA, 44 | CONVERT_ERROR_LAST 45 | } ConvertError; 46 | 47 | 48 | typedef enum 49 | { 50 | UINT_CONVERTER_FUNC, 51 | INT_CONVERTER_FUNC, 52 | FLOAT_CONVERTER_FUNC, 53 | STRING_CONVERTER_FUNC, 54 | STRING_OBJECT_CONVERTER_FUNC, 55 | NUM_CONVERTER_FUNCS 56 | } DefaultConverterFuncs; 57 | 58 | 59 | /* 60 | * converter function signature for functions that convert strings to a specific 61 | * data type and stores in output buffer 62 | * Inputs: 63 | * input: null terminated C string representing value to convert 64 | * input_len: length of input (redundant but input string originally was not 65 | * null terminated 66 | * input_type: indicates type of input (not used by every converter func) 67 | * output: pointer to memory block where output value should be stored 68 | * output_len: length of output reserved for output value 69 | * arg: optional arg value/struct specific to each converter func 70 | * Output: 71 | * error code defined above in ConvertError enum 72 | */ 73 | typedef ConvertError (*converter_func_ptr)(const char *input, 74 | uint32_t input_len, 75 | int 
input_type, 76 | void *output, 77 | uint32_t output_len, 78 | void *arg); 79 | 80 | /* 81 | * The following conversion functions follow conversion function signature 82 | * defined above 83 | */ 84 | 85 | /* Convert null terminated C string to signed int */ 86 | ConvertError str2int_converter(const char *input, uint32_t input_len, 87 | int input_type, void *output, uint32_t output_len, void *arg); 88 | /* Convert null terminated C string to unsigned int */ 89 | ConvertError str2uint_converter(const char *input, uint32_t input_len, 90 | int input_type, void *output, uint32_t output_len, void *arg); 91 | /* Convert null terminated C string to float/double */ 92 | ConvertError str2float_converter(const char *input, uint32_t input_len, 93 | int input_type, void *output, uint32_t output_len, void *arg); 94 | /* Copy null terminated C string to output of possibly different length */ 95 | ConvertError str2str_converter(void *input, uint32_t input_len, 96 | int input_type, void *output, uint32_t output_len, void *arg); 97 | /* Convert null terminated C string to complex number */ 98 | ConvertError str2complex_converter(void *input, uint32_t input_len, 99 | int input_type, void *output, uint32_t output_len, void *arg); 100 | 101 | 102 | /* 103 | * Extract signed int of various sizes from memory block and cast to 104 | * signed int64 if needed. Input integer size is specified by input_len argument. 105 | */ 106 | ConvertError get_int_value(void *input, uint32_t input_len, int64_t *value); 107 | 108 | /* 109 | * Extract unsigned int of various sizes from memory block and cast to 110 | * unsigned int64 if needed. Input integer size is specified by input_len argument. 111 | */ 112 | ConvertError get_uint_value(void *input, uint32_t input_len, uint64_t *value); 113 | 114 | /* 115 | * Extract double/float from from memory block and cast to 116 | * double if needed. Input floating point size is specified by input_len argument. 
117 | */ 118 | ConvertError get_float_value(void *input, uint32_t input_len, double *value); 119 | 120 | /* 121 | * Save signed int64 value to memory block, casting to appropriate output integer 122 | * size if needed. Output integer size is specified by output_len arg. 123 | */ 124 | ConvertError put_int_value(void *output, uint32_t output_len, int64_t value); 125 | 126 | /* 127 | * Save unsigned int64 value to memory block, casting to appropriate output integer 128 | * size if needed. Output integer size is specified by output_len arg. 129 | */ 130 | ConvertError put_uint_value(void *output, uint32_t output_len, uint64_t value); 131 | 132 | /* 133 | * Save double/float value to memory block, casting to appropriate output floating 134 | * point size if needed. Output float size is specified by output_len arg. 135 | */ 136 | ConvertError put_float_value(void *output, uint32_t output_len, double value); 137 | 138 | #endif 139 | -------------------------------------------------------------------------------- /textadapter/lib/errors.py: -------------------------------------------------------------------------------- 1 | class AdapterException(Exception): 2 | """Generic adapter exception for reporting reading, parsing, and 3 | converting issues. All adapter exceptions have following instance 4 | variables in common: 5 | 6 | * `record` - record reference where error occured 7 | * `field` - field reference where error occured 8 | """ 9 | def __init__(self, message=None): 10 | super(AdapterException, self).__init__(message) 11 | 12 | self.record = None 13 | self.field = None 14 | 15 | class SourceError(AdapterException): 16 | """Raised on error while reading or talking to a data source. It might be 17 | seek or read error for file sources or broken connection for database 18 | sources.""" 19 | pass 20 | 21 | class SourceNotFoundError(SourceError): 22 | """Raised when data source (file, table, ...) 
was not found.""" 23 | def __init__(self, message=None, source=None): 24 | super(SourceNotFoundError, self).__init__(message) 25 | self.source = source 26 | 27 | class ConfigurationError(AdapterException): 28 | """Raised when objects are mis-configured.""" 29 | pass 30 | 31 | class NoSuchFieldError(AdapterException): 32 | """Raised when non-existent field is referenced, either by name or position index.""" 33 | pass 34 | 35 | class DataIndexError(AdapterException): 36 | """Raised for example when a record is not found in record index in indexed 37 | data source.""" 38 | pass 39 | 40 | class DataTypeError(AdapterException): 41 | """Raised on data type mis-match or when type conversion fails.""" 42 | pass 43 | 44 | class ParserError(AdapterException): 45 | """Raised when there is problem with parsing source data, for example in 46 | broken text file with CSV. The `token` instance variable contains problematic 47 | token that was not parsed correctly.""" 48 | def __init__(self, message=None, token=None): 49 | super(ParserError, self).__init__(message) 50 | self.token = token 51 | 52 | class ArgumentError(AdapterException): 53 | """Invalid arguments used in calling textadapter functions/methods""" 54 | pass 55 | 56 | class InternalInconsistencyError(AdapterException): 57 | """Raised when the library goes into a state that is not expected to 58 | happen.""" 59 | pass 60 | 61 | class AdapterIndexError(AdapterException): 62 | """ Raised when record number or slice is invalid """ 63 | pass 64 | 65 | -------------------------------------------------------------------------------- /textadapter/lib/field_info.c: -------------------------------------------------------------------------------- 1 | #include "field_info.h" 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | /* Set the number of fields in input data. 
This */ 8 | void set_num_fields(FieldList *fields, uint32_t num_fields) 9 | { 10 | uint32_t i; 11 | 12 | #ifdef DEBUG_ADAPTER 13 | printf("set_num_fields() setting number of fields to %u\n", num_fields); 14 | #endif 15 | 16 | if (fields == NULL) 17 | return; 18 | 19 | if (fields->field_info != NULL) 20 | { 21 | clear_fields(fields); 22 | } 23 | 24 | if (num_fields > 0) 25 | fields->field_info = (FieldInfo*)calloc(num_fields, sizeof(FieldInfo)); 26 | 27 | fields->num_fields = num_fields; 28 | 29 | for (i = 0; i < num_fields; i++) 30 | { 31 | fields->field_info[i].infer_type = 1; 32 | } 33 | } 34 | 35 | /* Initialize infer_type flag in each field to 1 */ 36 | void init_infer_types(FieldList *fields) 37 | { 38 | uint32_t i; 39 | for(i = 0; i < fields->num_fields; i++) 40 | { 41 | fields->field_info[i].infer_type = 1; 42 | } 43 | } 44 | 45 | /* Initialize missing value struct */ 46 | void init_missing_values(FieldList *fields, char *field_name, 47 | uint32_t field_num, uint32_t num_missing_values) 48 | { 49 | MissingValues *missing_values; 50 | 51 | if (fields == NULL) 52 | return; 53 | 54 | if (field_num >= fields->num_fields) 55 | return; 56 | 57 | missing_values = &fields->field_info[field_num].missing_values; 58 | 59 | clear_missing_values(missing_values); 60 | 61 | missing_values->num_missing_values = num_missing_values; 62 | missing_values->missing_value_lens = 63 | calloc(num_missing_values, sizeof(uint32_t)); 64 | missing_values->missing_values = 65 | calloc(num_missing_values, sizeof(char *)); 66 | } 67 | 68 | 69 | /* Add missing value string for the specified field */ 70 | void add_missing_value(FieldList *fields, char *field_name, 71 | uint32_t field_num, char *missing_value, uint32_t missing_value_len) 72 | { 73 | MissingValues *missing_values; 74 | uint32_t i; 75 | 76 | if (fields == NULL) 77 | return; 78 | 79 | if (field_num >= fields->num_fields) 80 | return; 81 | 82 | missing_values = &fields->field_info[field_num].missing_values; 83 | 84 | /* Find 
first empty entry in missing values array to store missing 85 | value string */ 86 | i = 0; 87 | while (i < missing_values->num_missing_values && 88 | missing_values->missing_values[i] > 0) 89 | { 90 | i++; 91 | } 92 | 93 | missing_values->missing_values[i] = 94 | calloc(missing_value_len + 1, sizeof(char)); 95 | strncpy(missing_values->missing_values[i], missing_value, missing_value_len); 96 | missing_values->missing_value_lens[i] = missing_value_len; 97 | } 98 | 99 | 100 | /* Set pointer to fill value for specified field. Positive valeu for 101 | 'loose' argument enables fill value to be used when token for this 102 | field cannot be converted. */ 103 | void set_fill_value(FieldList *fields, char *field_name, 104 | uint32_t field_num, void *new_fill_value, uint32_t fill_value_len, int loose) 105 | { 106 | FillValue *fill_value; 107 | 108 | if (fields == NULL) 109 | return; 110 | 111 | if (field_num >= fields->num_fields) 112 | return; 113 | 114 | fill_value = &fields->field_info[field_num].fill_value; 115 | 116 | if (new_fill_value == NULL) 117 | { 118 | clear_fill_value(fill_value); 119 | } 120 | else 121 | { 122 | fill_value->fill_value = calloc(1, fill_value_len); 123 | memcpy(fill_value->fill_value, new_fill_value, fill_value_len); 124 | fill_value->loose = loose; 125 | } 126 | } 127 | 128 | 129 | uint32_t get_field_size(FieldList *fields, char *field_name, uint32_t field_num) 130 | { 131 | uint32_t i; 132 | 133 | if (fields == NULL) 134 | return 0; 135 | 136 | if (field_name != NULL) 137 | { 138 | i = 0; 139 | while (i < fields->num_fields) 140 | { 141 | if (strcpy(fields->field_info[i].name, field_name)) 142 | { 143 | return fields->field_info[i].output_field_size; 144 | } 145 | i++; 146 | } 147 | 148 | return 0; 149 | } 150 | else 151 | { 152 | return fields->field_info[field_num].output_field_size; 153 | } 154 | } 155 | 156 | 157 | uint32_t get_output_record_size(FieldList *fields) 158 | { 159 | uint32_t i; 160 | uint32_t rec_size; 161 | 162 | if (fields 
== NULL) 163 | return 0; 164 | 165 | rec_size = 0; 166 | 167 | for (i = 0; i < fields->num_fields; i++) 168 | { 169 | if (fields->field_info[i].converter != NULL) 170 | { 171 | rec_size += fields->field_info[i].output_field_size; 172 | } 173 | } 174 | 175 | return rec_size; 176 | } 177 | 178 | 179 | 180 | /* Deallocate missing value strings */ 181 | void clear_missing_values(MissingValues *missing_values) 182 | { 183 | uint32_t i; 184 | 185 | assert(missing_values != NULL); 186 | 187 | if (missing_values->missing_values != NULL) 188 | { 189 | for (i = 0; i < missing_values->num_missing_values; i++) 190 | { 191 | if (missing_values->missing_values[i] != NULL) 192 | free(missing_values->missing_values[i]); 193 | } 194 | 195 | free(missing_values->missing_values); 196 | missing_values->missing_values = NULL; 197 | } 198 | 199 | if (missing_values->missing_value_lens != NULL) 200 | { 201 | free(missing_values->missing_value_lens); 202 | missing_values->missing_value_lens = NULL; 203 | } 204 | 205 | missing_values->num_missing_values = 0; 206 | } 207 | 208 | 209 | /* Deallocate pointer to fill value for specified field */ 210 | void clear_fill_value(FillValue *fill_value) 211 | { 212 | assert(fill_value != NULL); 213 | 214 | if (fill_value->fill_value != NULL) 215 | { 216 | free(fill_value->fill_value); 217 | fill_value->fill_value = NULL; 218 | } 219 | } 220 | 221 | 222 | void clear_fields(FieldList *fields) 223 | { 224 | uint32_t i; 225 | 226 | for (i = 0; i < fields->num_fields; i++) 227 | { 228 | if (fields->field_info[i].name != NULL) 229 | { 230 | free(fields->field_info[i].name); 231 | } 232 | fields->field_info[i].name = NULL; 233 | 234 | fields->field_info[i].converter = NULL; 235 | fields->field_info[i].converter_arg = NULL; 236 | 237 | clear_missing_values(&fields->field_info[i].missing_values); 238 | clear_fill_value(&fields->field_info[i].fill_value); 239 | 240 | fields->field_info[i].output_field_size = 0; 241 | fields->field_info[i].input_field_width = 0; 
242 | } 243 | 244 | free(fields->field_info); 245 | } 246 | 247 | 248 | /* Set fixed field width for specified field */ 249 | void set_field_width(FieldList *fields, uint32_t field, uint32_t width) 250 | { 251 | if (fields == NULL) 252 | return; 253 | 254 | if (field >= fields->num_fields) 255 | return; 256 | 257 | fields->field_info[field].input_field_width = width; 258 | } 259 | 260 | 261 | void reset_converters(FieldList *fields) 262 | { 263 | uint32_t field; 264 | 265 | if (fields == NULL) 266 | return; 267 | 268 | for (field = 0; field < fields->num_fields; field++) 269 | { 270 | fields->field_info[field].converter = NULL; 271 | fields->field_info[field].converter_arg = NULL; 272 | } 273 | } 274 | 275 | 276 | void set_converter(FieldList *fields, uint32_t field_num, char *field_name, 277 | uint32_t output_field_size, converter_func_ptr converter, 278 | void *converter_arg) 279 | { 280 | if (fields == NULL) 281 | return; 282 | 283 | if (field_num >= fields->num_fields) 284 | return; 285 | 286 | //if (field_name == NULL) 287 | // return; 288 | 289 | if (fields->field_info[field_num].name != NULL) 290 | { 291 | free(fields->field_info[field_num].name); 292 | } 293 | 294 | if (field_name != NULL) 295 | { 296 | fields->field_info[field_num].name = 297 | calloc(strlen(field_name), sizeof(char)); 298 | strncpy(fields->field_info[field_num].name, field_name, strlen(field_name)); 299 | } 300 | else 301 | { 302 | fields->field_info[field_num].name = NULL; 303 | } 304 | 305 | fields->field_info[field_num].converter = converter; 306 | fields->field_info[field_num].converter_arg = converter_arg; 307 | fields->field_info[field_num].output_field_size = output_field_size; 308 | } 309 | 310 | 311 | int infer_types(FieldList *fields) 312 | { 313 | uint32_t i; 314 | 315 | for (i = 0; i < fields->num_fields; i++) 316 | { 317 | if (fields->field_info[i].infer_type == 1) 318 | return 1; 319 | } 320 | 321 | return 0; 322 | } 323 | 
-------------------------------------------------------------------------------- /textadapter/lib/field_info.h: -------------------------------------------------------------------------------- 1 | #ifndef FIELD_INFO_H 2 | #define FIELD_INFO_H 3 | 4 | #include "converter_functions.h" 5 | 6 | 7 | typedef struct missing_values_t 8 | { 9 | char **missing_values; 10 | uint32_t *missing_value_lens; 11 | uint32_t num_missing_values; 12 | } MissingValues; 13 | 14 | 15 | typedef struct fill_value_t 16 | { 17 | void *fill_value; 18 | int loose; 19 | } FillValue; 20 | 21 | 22 | typedef struct field_info_t 23 | { 24 | char *name; 25 | 26 | /* converter function to convert data to target data type */ 27 | converter_func_ptr converter; 28 | void *converter_arg; 29 | 30 | MissingValues missing_values; 31 | 32 | FillValue fill_value; 33 | 34 | /* field width for fixed width data */ 35 | uint32_t input_field_width; 36 | 37 | /* field size in output array */ 38 | uint32_t output_field_size; 39 | 40 | /* flag allows user to fix the type. 
default, though, is to infer_type */ 41 | int infer_type; 42 | 43 | } FieldInfo; 44 | 45 | 46 | typedef struct field_list_t 47 | { 48 | uint32_t num_fields; 49 | FieldInfo *field_info; 50 | } FieldList; 51 | 52 | 53 | void clear_fields(FieldList *fields); 54 | void set_num_fields(FieldList *fields, uint32_t num_fields); 55 | 56 | void clear_missing_values(MissingValues *missing_values); 57 | void clear_fill_value(FillValue *fill_value); 58 | 59 | void init_missing_values(FieldList *fields, char *field_name, 60 | uint32_t field_num, uint32_t num_missing_values); 61 | 62 | void add_missing_value(FieldList *fields, char *field_name, 63 | uint32_t field_num, char *missing_value, uint32_t missing_value_len); 64 | 65 | void set_fill_value(FieldList *fields, char *field_name, 66 | uint32_t field_num, void *fill_value, uint32_t fill_value_len, int loose); 67 | 68 | uint32_t get_field_size(FieldList *fields, char *field_name, 69 | uint32_t field_num); 70 | uint32_t get_output_record_size(FieldList *fields); 71 | 72 | void set_field_width(FieldList *fields, uint32_t field, uint32_t width); 73 | 74 | /* Resets converter function pointers to null */ 75 | void reset_converters(FieldList *fields); 76 | 77 | /* Sets converter function for specified field with specified field size. 78 | * converter_arg will be passed to converter function when called. 
*/ 79 | void set_converter(FieldList *fields, uint32_t field_num, char *field_name, 80 | uint32_t output_field_size, converter_func_ptr converter, 81 | void *converter_arg); 82 | 83 | /* Initialize the type of each of the fields to be inferred */ 84 | void init_infer_types(FieldList *fields); 85 | 86 | int infer_types(FieldList *fields); 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /textadapter/lib/kstring.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "_stdint.h" 6 | #include "kstring.h" 7 | 8 | #ifdef _MSC_VER 9 | #define va_copy(d,s) ((d)=(s)) 10 | #endif 11 | 12 | int kvsprintf(kstring_t *s, const char *fmt, va_list ap) 13 | { 14 | va_list args; 15 | int l; 16 | va_copy(args, ap); 17 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. 18 | va_end(args); 19 | #ifdef _MSC_VER 20 | if (l == -1) { 21 | va_copy(args, ap); 22 | l = _vscprintf(fmt, args); 23 | va_end(args); 24 | #else 25 | if (l + 1 > s->m - s->l) { 26 | #endif 27 | s->m = s->l + l + 2; 28 | kroundup32(s->m); 29 | s->s = (char*)realloc(s->s, s->m); 30 | va_copy(args, ap); 31 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); 32 | va_end(args); 33 | } 34 | s->l += l; 35 | return l; 36 | } 37 | 38 | int ksprintf(kstring_t *s, const char *fmt, ...) 
39 | { 40 | va_list ap; 41 | int l; 42 | va_start(ap, fmt); 43 | l = kvsprintf(s, fmt, ap); 44 | va_end(ap); 45 | return l; 46 | } 47 | 48 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) 49 | { 50 | const char *p, *start; 51 | if (sep) { // set up the table 52 | if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished 53 | aux->finished = 0; 54 | if (sep[1]) { 55 | aux->sep = -1; 56 | aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; 57 | for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); 58 | } else aux->sep = sep[0]; 59 | } 60 | if (aux->finished) return 0; 61 | else if (str) aux->p = str - 1, aux->finished = 0; 62 | if (aux->sep < 0) { 63 | for (p = start = aux->p + 1; *p; ++p) 64 | if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; 65 | } else { 66 | for (p = start = aux->p + 1; *p; ++p) 67 | if (*p == aux->sep) break; 68 | } 69 | aux->p = p; // end of token 70 | if (*p == 0) aux->finished = 1; // no more tokens 71 | return (char*)start; 72 | } 73 | 74 | // s MUST BE a null terminated string; l = strlen(s) 75 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) 76 | { 77 | int i, n, max, last_char, last_start, *offsets, l; 78 | n = 0; max = *_max; offsets = *_offsets; 79 | l = strlen(s); 80 | 81 | #define __ksplit_aux do { \ 82 | if (_offsets) { \ 83 | s[i] = 0; \ 84 | if (n == max) { \ 85 | int *tmp; \ 86 | max = max? 
max<<1 : 2; \ 87 | if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \ 88 | offsets = tmp; \ 89 | } else { \ 90 | free(offsets); \ 91 | *_offsets = NULL; \ 92 | return 0; \ 93 | } \ 94 | } \ 95 | offsets[n++] = last_start; \ 96 | } else ++n; \ 97 | } while (0) 98 | 99 | for (i = 0, last_char = last_start = 0; i <= l; ++i) { 100 | if (delimiter == 0) { 101 | if (isspace(s[i]) || s[i] == 0) { 102 | if (isgraph(last_char)) __ksplit_aux; // the end of a field 103 | } else { 104 | if (isspace(last_char) || last_char == 0) last_start = i; 105 | } 106 | } else { 107 | if (s[i] == delimiter || s[i] == 0) { 108 | if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field 109 | } else { 110 | if (last_char == delimiter || last_char == 0) last_start = i; 111 | } 112 | } 113 | last_char = s[i]; 114 | } 115 | *_max = max; *_offsets = offsets; 116 | return n; 117 | } 118 | 119 | int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp) 120 | { 121 | size_t l0 = s->l; 122 | 123 | while (s->l == l0 || s->s[s->l-1] != '\n') { 124 | if (s->m - s->l < 200) ks_resize(s, s->m + 200); 125 | if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break; 126 | s->l += strlen(s->s + s->l); 127 | } 128 | 129 | if (s->l == l0) return EOF; 130 | 131 | if (s->l > l0 && s->s[s->l-1] == '\n') { 132 | s->l--; 133 | if (s->l > l0 && s->s[s->l-1] == '\r') s->l--; 134 | } 135 | s->s[s->l] = '\0'; 136 | return 0; 137 | } 138 | 139 | /********************** 140 | * Boyer-Moore search * 141 | **********************/ 142 | 143 | typedef unsigned char ubyte_t; 144 | 145 | // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html 146 | static int *ksBM_prep(const ubyte_t *pat, int m) 147 | { 148 | int i, *suff, *prep, *bmGs, *bmBc; 149 | prep = (int*)calloc(m + 256, sizeof(int)); 150 | bmGs = prep; bmBc = prep + m; 151 | { // preBmBc() 152 | for (i = 0; i < 256; ++i) bmBc[i] = m; 153 | for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; 154 | } 155 | suff = 
(int*)calloc(m, sizeof(int)); 156 | { // suffixes() 157 | int f = 0, g; 158 | suff[m - 1] = m; 159 | g = m - 1; 160 | for (i = m - 2; i >= 0; --i) { 161 | if (i > g && suff[i + m - 1 - f] < i - g) 162 | suff[i] = suff[i + m - 1 - f]; 163 | else { 164 | if (i < g) g = i; 165 | f = i; 166 | while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; 167 | suff[i] = f - g; 168 | } 169 | } 170 | } 171 | { // preBmGs() 172 | int j = 0; 173 | for (i = 0; i < m; ++i) bmGs[i] = m; 174 | for (i = m - 1; i >= 0; --i) 175 | if (suff[i] == i + 1) 176 | for (; j < m - 1 - i; ++j) 177 | if (bmGs[j] == m) 178 | bmGs[j] = m - 1 - i; 179 | for (i = 0; i <= m - 2; ++i) 180 | bmGs[m - 1 - suff[i]] = m - 1 - i; 181 | } 182 | free(suff); 183 | return prep; 184 | } 185 | 186 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) 187 | { 188 | int i, j, *prep = 0, *bmGs, *bmBc; 189 | const ubyte_t *str, *pat; 190 | str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; 191 | prep = (_prep == 0 || *_prep == 0)? 
ksBM_prep(pat, m) : *_prep; 192 | if (_prep && *_prep == 0) *_prep = prep; 193 | bmGs = prep; bmBc = prep + m; 194 | j = 0; 195 | while (j <= n - m) { 196 | for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); 197 | if (i >= 0) { 198 | int max = bmBc[str[i+j]] - m + 1 + i; 199 | if (max < bmGs[i]) max = bmGs[i]; 200 | j += max; 201 | } else return (void*)(str + j); 202 | } 203 | if (_prep == 0) free(prep); 204 | return 0; 205 | } 206 | 207 | char *kstrstr(const char *str, const char *pat, int **_prep) 208 | { 209 | return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); 210 | } 211 | 212 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep) 213 | { 214 | return (char*)kmemmem(str, n, pat, strlen(pat), _prep); 215 | } 216 | 217 | /*********************** 218 | * The main() function * 219 | ***********************/ 220 | 221 | #ifdef KSTRING_MAIN 222 | #include 223 | int main() 224 | { 225 | kstring_t *s; 226 | int *fields, n, i; 227 | ks_tokaux_t aux; 228 | char *p; 229 | s = (kstring_t*)calloc(1, sizeof(kstring_t)); 230 | // test ksprintf() 231 | ksprintf(s, " abcdefg: %d ", 100); 232 | printf("'%s'\n", s->s); 233 | // test ksplit() 234 | fields = ksplit(s, 0, &n); 235 | for (i = 0; i < n; ++i) 236 | printf("field[%d] = '%s'\n", i, s->s + fields[i]); 237 | // test kstrtok() 238 | s->l = 0; 239 | for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { 240 | kputsn(p, aux.p - p, s); 241 | kputc('\n', s); 242 | } 243 | printf("%s", s->s); 244 | // free 245 | free(s->s); free(s); free(fields); 246 | 247 | { 248 | static char *str = "abcdefgcdgcagtcakcdcd"; 249 | static char *pat = "cd"; 250 | char *ret, *s = str; 251 | int *prep = 0; 252 | while ((ret = kstrstr(s, pat, &prep)) != 0) { 253 | printf("match: %s\n", ret); 254 | s = ret + prep[0]; 255 | } 256 | free(prep); 257 | } 258 | return 0; 259 | } 260 | #endif 261 | -------------------------------------------------------------------------------- 
/textadapter/lib/kstring.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #ifndef KSTRING_H 27 | #define KSTRING_H 28 | 29 | #include 30 | #include 31 | #include 32 | #include "_stdint.h" 33 | #include 34 | 35 | #ifdef _MSC_VER 36 | #define inline __inline 37 | #endif 38 | 39 | #ifndef kroundup32 40 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 41 | #endif 42 | 43 | #if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) 44 | #define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg))) 45 | #else 46 | #define KS_ATTR_PRINTF(fmt, arg) 47 | #endif 48 | 49 | 50 | /* kstring_t is a simple non-opaque type whose fields are likely to be 51 | * used directly by user code (but see also ks_str() and ks_len() below). 52 | * A kstring_t object is initialised by either of 53 | * kstring_t str = { 0, 0, NULL }; 54 | * kstring_t str; ...; str.l = str.m = 0; str.s = NULL; 55 | * and either ownership of the underlying buffer should be given away before 56 | * the object disappears (see ks_release() below) or the kstring_t should be 57 | * destroyed with free(str.s); */ 58 | #ifndef KSTRING_T 59 | #define KSTRING_T kstring_t 60 | typedef struct __kstring_t { 61 | size_t l, m; 62 | char *s; 63 | } kstring_t; 64 | #endif 65 | 66 | typedef struct { 67 | uint64_t tab[4]; 68 | int sep, finished; 69 | const char *p; // end of the current token 70 | } ks_tokaux_t; 71 | 72 | #ifdef __cplusplus 73 | extern "C" { 74 | #endif 75 | 76 | int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0); 77 | int ksprintf(kstring_t *s, const char *fmt, ...) 
KS_ATTR_PRINTF(2,3); 78 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); 79 | char *kstrstr(const char *str, const char *pat, int **_prep); 80 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep); 81 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); 82 | 83 | /* kstrtok() is similar to strtok_r() except that str is not 84 | * modified and both str and sep can be NULL. For efficiency, it is 85 | * actually recommended to set both to NULL in the subsequent calls 86 | * if sep is not changed. */ 87 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); 88 | 89 | /* kgetline() uses the supplied fgets()-like function to read a "\n"- 90 | * or "\r\n"-terminated line from fp. The line read is appended to the 91 | * kstring without its terminator and 0 is returned; EOF is returned at 92 | * EOF or on error (determined by querying fp, as per fgets()). */ 93 | typedef char *kgets_func(char *, int, void *); 94 | int kgetline(kstring_t *s, kgets_func *fgets, void *fp); 95 | 96 | #ifdef __cplusplus 97 | } 98 | #endif 99 | 100 | static inline int ks_resize(kstring_t *s, size_t size) 101 | { 102 | if (s->m < size) { 103 | char *tmp; 104 | s->m = size; 105 | kroundup32(s->m); 106 | if ((tmp = (char*)realloc(s->s, s->m))) 107 | s->s = tmp; 108 | else 109 | return -1; 110 | } 111 | return 0; 112 | } 113 | 114 | static inline char *ks_str(kstring_t *s) 115 | { 116 | return s->s; 117 | } 118 | 119 | static inline size_t ks_len(kstring_t *s) 120 | { 121 | return s->l; 122 | } 123 | 124 | // Give ownership of the underlying buffer away to something else (making 125 | // that something else responsible for freeing it), leaving the kstring_t 126 | // empty and ready to be used again, or ready to go out of scope without 127 | // needing free(str.s) to prevent a memory leak. 
128 | static inline char *ks_release(kstring_t *s) 129 | { 130 | char *ss = s->s; 131 | s->l = s->m = 0; 132 | s->s = NULL; 133 | return ss; 134 | } 135 | 136 | static inline int kputsn(const char *p, int l, kstring_t *s) 137 | { 138 | if (s->l + l + 1 >= s->m) { 139 | char *tmp; 140 | s->m = s->l + l + 2; 141 | kroundup32(s->m); 142 | if ((tmp = (char*)realloc(s->s, s->m))) 143 | s->s = tmp; 144 | else 145 | return EOF; 146 | } 147 | memcpy(s->s + s->l, p, l); 148 | s->l += l; 149 | s->s[s->l] = 0; 150 | return l; 151 | } 152 | 153 | static inline int kputs(const char *p, kstring_t *s) 154 | { 155 | return kputsn(p, strlen(p), s); 156 | } 157 | 158 | static inline int kputc(int c, kstring_t *s) 159 | { 160 | if (s->l + 1 >= s->m) { 161 | char *tmp; 162 | s->m = s->l + 2; 163 | kroundup32(s->m); 164 | if ((tmp = (char*)realloc(s->s, s->m))) 165 | s->s = tmp; 166 | else 167 | return EOF; 168 | } 169 | s->s[s->l++] = c; 170 | s->s[s->l] = 0; 171 | return c; 172 | } 173 | 174 | static inline int kputc_(int c, kstring_t *s) 175 | { 176 | if (s->l + 1 > s->m) { 177 | char *tmp; 178 | s->m = s->l + 1; 179 | kroundup32(s->m); 180 | if ((tmp = (char*)realloc(s->s, s->m))) 181 | s->s = tmp; 182 | else 183 | return EOF; 184 | } 185 | s->s[s->l++] = c; 186 | return 1; 187 | } 188 | 189 | static inline int kputsn_(const void *p, int l, kstring_t *s) 190 | { 191 | if (s->l + l > s->m) { 192 | char *tmp; 193 | s->m = s->l + l; 194 | kroundup32(s->m); 195 | if ((tmp = (char*)realloc(s->s, s->m))) 196 | s->s = tmp; 197 | else 198 | return EOF; 199 | } 200 | memcpy(s->s + s->l, p, l); 201 | s->l += l; 202 | return l; 203 | } 204 | 205 | static inline int kputw(int c, kstring_t *s) 206 | { 207 | char buf[16]; 208 | int i, l = 0; 209 | unsigned int x = c; 210 | if (c < 0) x = -x; 211 | do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); 212 | if (c < 0) buf[l++] = '-'; 213 | if (s->l + l + 1 >= s->m) { 214 | char *tmp; 215 | s->m = s->l + l + 2; 216 | kroundup32(s->m); 217 | if 
((tmp = (char*)realloc(s->s, s->m))) 218 | s->s = tmp; 219 | else 220 | return EOF; 221 | } 222 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 223 | s->s[s->l] = 0; 224 | return 0; 225 | } 226 | 227 | static inline int kputuw(unsigned c, kstring_t *s) 228 | { 229 | char buf[16]; 230 | int l, i; 231 | unsigned x; 232 | if (c == 0) return kputc('0', s); 233 | for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; 234 | if (s->l + l + 1 >= s->m) { 235 | char *tmp; 236 | s->m = s->l + l + 2; 237 | kroundup32(s->m); 238 | if ((tmp = (char*)realloc(s->s, s->m))) 239 | s->s = tmp; 240 | else 241 | return EOF; 242 | } 243 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 244 | s->s[s->l] = 0; 245 | return 0; 246 | } 247 | 248 | static inline int kputl(long c, kstring_t *s) 249 | { 250 | char buf[32]; 251 | int i, l = 0; 252 | unsigned long x = c; 253 | if (c < 0) x = -x; 254 | do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); 255 | if (c < 0) buf[l++] = '-'; 256 | if (s->l + l + 1 >= s->m) { 257 | char *tmp; 258 | s->m = s->l + l + 2; 259 | kroundup32(s->m); 260 | if ((tmp = (char*)realloc(s->s, s->m))) 261 | s->s = tmp; 262 | else 263 | return EOF; 264 | } 265 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 266 | s->s[s->l] = 0; 267 | return 0; 268 | } 269 | 270 | /* 271 | * Returns 's' split by delimiter, with *n being the number of components; 272 | * NULL on failue. 
273 | */ 274 | static inline int *ksplit(kstring_t *s, int delimiter, int *n) 275 | { 276 | int max = 0, *offsets = 0; 277 | *n = ksplit_core(s->s, delimiter, &max, &offsets); 278 | return offsets; 279 | } 280 | 281 | #endif 282 | -------------------------------------------------------------------------------- /textadapter/lib/kvec.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "kvec.h" 30 | int main() { 31 | kvec_t(int) array; 32 | kv_init(array); 33 | kv_push(int, array, 10); // append 34 | kv_a(int, array, 20) = 5; // dynamic 35 | kv_A(array, 20) = 4; // static 36 | kv_destroy(array); 37 | return 0; 38 | } 39 | */ 40 | 41 | /* 42 | 2008-09-22 (0.1.0): 43 | 44 | * The initial version. 
45 | 46 | */ 47 | 48 | #ifndef AC_KVEC_H 49 | #define AC_KVEC_H 50 | 51 | #include 52 | 53 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 54 | 55 | #define kvec_t(type) struct { size_t n, m; type *a; } 56 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) 57 | #define kv_destroy(v) free((v).a) 58 | #define kv_A(v, i) ((v).a[(i)]) 59 | #define kv_pop(v) ((v).a[--(v).n]) 60 | #define kv_size(v) ((v).n) 61 | #define kv_max(v) ((v).m) 62 | 63 | #define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) 64 | 65 | #define kv_copy(type, v1, v0) do { \ 66 | if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ 67 | (v1).n = (v0).n; \ 68 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ 69 | } while (0) \ 70 | 71 | #define kv_push(type, v, x) do { \ 72 | if ((v).n == (v).m) { \ 73 | (v).m = (v).m? (v).m<<1 : 2; \ 74 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 75 | } \ 76 | (v).a[(v).n++] = (x); \ 77 | } while (0) 78 | 79 | #define kv_pushp(type, v) (((v).n == (v).m)? \ 80 | ((v).m = ((v).m? (v).m<<1 : 2), \ 81 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 82 | : 0), ((v).a + ((v).n++)) 83 | 84 | #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ 85 | ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ 86 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 87 | : (v).n <= (size_t)(i)? 
(v).n = (i) \ 88 | : 0), (v).a[(i)] 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /textadapter/tests/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -g -Werror -Wall -Wdeclaration-after-statement 3 | TEXT_INCLUDE_DIRS = -I ../textadapter -I ../lib 4 | TEXT_LIBS = -lz -lpcre 5 | TEXT_OBJS = test_text_adapter.o text_adapter.o converter_functions.o index.o 6 | 7 | 8 | test_text_adapter: $(TEXT_OBJS) 9 | $(CC) $(CFLAGS) $(TEXT_OBJS) -o test_text_adapter $(TEXT_LIBS) 10 | 11 | test_text_adapter.o: test_text_adapter.c 12 | $(CC) $(CFLAGS) -c test_text_adapter.c $(TEXT_INCLUDE_DIRS) 13 | 14 | text_adapter.o: ../textadapter/text_adapter.c 15 | $(CC) $(CFLAGS) -c ../textadapter/text_adapter.c $(TEXT_INCLUDE_DIRS) 16 | 17 | converter_functions.o: ../lib/converter_functions.c 18 | $(CC) $(CFLAGS) -c ../lib/converter_functions.c $(TEXT_INCLUDE_DIRS) 19 | 20 | index.o: ../textadapter/index.c 21 | $(CC) $(CFLAGS) -c ../textadapter/index.c $(TEXT_INCLUDE_DIRS) 22 | 23 | field_info.o: ../lib/field_info.c 24 | $(CC) $(CFLAGS) -c ../lib/field_info.c $(TEXT_INCLUDE_DIRS) 25 | 26 | clean: 27 | -rm test_text_adapter 28 | -rm -f *.o 29 | 30 | -------------------------------------------------------------------------------- /textadapter/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContinuumIO/TextAdapter/53138c2277cdfcf32e127251313d4f77f81050aa/textadapter/tests/__init__.py -------------------------------------------------------------------------------- /textadapter/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | def pytest_addoption(parser): 4 | parser.addoption('--pg_host', action='store') 5 | parser.addoption('--pg_dbname', action='store') 6 | parser.addoption('--pg_user', action='store') 7 | 
parser.addoption('--acc_host', action='store') 8 | parser.addoption('--acc_user', action='store') 9 | parser.addoption('--acc_password', action='store') 10 | -------------------------------------------------------------------------------- /textadapter/tests/data/benchmarks.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | import os 3 | 4 | 5 | def timeFunction(function, setup): 6 | print 'timing', function 7 | t = timeit.Timer(stmt=function, setup=setup) 8 | times = [] 9 | for i in range(0,3): 10 | os.system('sudo sh -c "sync; echo 3 > /proc/sys/vm/drop_caches"') 11 | times.append(str(t.timeit(number=1))) 12 | return min(times) 13 | 14 | 15 | ints1 = timeFunction('blazeopt.loadtxt("ints1", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 16 | ints2 = timeFunction('blazeopt.loadtxt("ints2", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 17 | ints3 = timeFunction('blazeopt.loadtxt("ints3", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 18 | print ints1, ints2, ints3 19 | 20 | floats1 = timeFunction('blazeopt.loadtxt("floats1", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 21 | floats2 = timeFunction('blazeopt.loadtxt("floats2", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 22 | floats3 = timeFunction('blazeopt.loadtxt("floats3", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 23 | print floats1, floats2, floats3 24 | 25 | ints1 = timeFunction('blazeopt.genfromtxt("ints1", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 26 | ints2 = timeFunction('blazeopt.genfromtxt("ints2", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 27 | ints3 = timeFunction('blazeopt.genfromtxt("ints3", dtype="u4,u4,u4,u4,u4", delimiter=",")', 'import blazeopt') 28 | print ints1, ints2, ints3 29 | 30 | floats1 = timeFunction('blazeopt.genfromtxt("floats1", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 31 | floats2 = 
timeFunction('blazeopt.genfromtxt("floats2", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 32 | floats3 = timeFunction('blazeopt.genfromtxt("floats3", dtype="f8,f8,f8,f8,f8", delimiter=",")', 'import blazeopt') 33 | print floats1, floats2, floats3 34 | 35 | missingValues1 = timeFunction('blazeopt.genfromtxt("missingvalues1", dtype="u4,u4,u4,u4,u4", delimiter=",", missing_values={0:["NA","NaN"], 1:["xx","inf"]}, filling_values="999")', 'import blazeopt') 36 | missingValues2 = timeFunction('blazeopt.genfromtxt("missingvalues2", dtype="u4,u4,u4,u4,u4", delimiter=",", missing_values={0:["NA","NaN"], 1:["xx","inf"]}, filling_values="999")', 'import blazeopt') 37 | missingValues3 = timeFunction('blazeopt.genfromtxt("missingvalues3", dtype="u4,u4,u4,u4,u4", delimiter=",", missing_values={0:["NA","NaN"], 1:["xx","inf"]}, filling_values="999")', 'import blazeopt') 38 | print missingValues1, missingValues2, missingValues3 39 | 40 | fixedwidth1 = timeFunction('blazeopt.genfromtxt("fixedwidth1", dtype="u4,u4,u4,u4,u4", delimiter=[2,3,4,5,6])', 'import blazeopt') 41 | fixedwidth2 = timeFunction('blazeopt.genfromtxt("fixedwidth2", dtype="u4,u4,u4,u4,u4", delimiter=[2,3,4,5,6])', 'import blazeopt') 42 | fixedwidth3 = timeFunction('blazeopt.genfromtxt("fixedwidth3", dtype="u4,u4,u4,u4,u4", delimiter=[2,3,4,5,6])', 'import blazeopt') 43 | print fixedwidth1, fixedwidth2, fixedwidth3 44 | 45 | -------------------------------------------------------------------------------- /textadapter/tests/generate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import time 4 | import gzip 5 | import numpy 6 | 7 | 8 | def generate_dataset(output, valueIter, delimiter, num_recs): 9 | for i in range(0, num_recs): 10 | line = '' 11 | for j in range(0, 5): 12 | if j == 5 - 1: 13 | line += str(valueIter.next()) 14 | else: 15 | line += str(valueIter.next()) + delimiter 16 | output.write(line) 17 | output.write('\n') 18 | 
output.seek(0) 19 | 20 | 21 | class IntIter(object): 22 | 23 | def __init__(self): 24 | self.value = 0 25 | 26 | def __str__(self): 27 | return 'ints' 28 | 29 | def __iter__(self): 30 | return self 31 | 32 | def next(self): 33 | nextValue = self.value 34 | self.value = self.value + 1 35 | return nextValue 36 | 37 | 38 | class SignedIntIter(object): 39 | 40 | def __init__(self): 41 | self.value = -1 42 | 43 | def __str__(self): 44 | return 'signed int' 45 | 46 | def __iter__(self): 47 | return self 48 | 49 | def next(self): 50 | nextValue = self.value 51 | if self.value < 0: 52 | self.value = self.value - 1 53 | else: 54 | self.value = self.value + 1 55 | self.value *= -1 56 | return nextValue 57 | 58 | 59 | class FloatIter(object): 60 | 61 | def __init__(self): 62 | self.value = 0.0 63 | 64 | def __str__(self): 65 | return 'floats' 66 | 67 | def __iter__(self): 68 | return self 69 | 70 | def next(self): 71 | nextValue = self.value 72 | self.value = self.value + 0.1 73 | return nextValue 74 | 75 | 76 | class MissingValuesIter(object): 77 | 78 | def __init__(self): 79 | self.value = 0 80 | 81 | def __str__(self): 82 | return 'missing values' 83 | 84 | def __iter__(self): 85 | return self 86 | 87 | def next(self): 88 | nextValue = self.value 89 | if nextValue % 20 == 0: 90 | nextValue = 'NA' 91 | elif nextValue % 20 == 4: 92 | nextValue = 'xx' 93 | elif nextValue % 20 == 5: 94 | nextValue = 'NaN' 95 | elif nextValue % 20 == 9: 96 | nextValue = 'inf' 97 | self.value = self.value + 1 98 | return nextValue 99 | 100 | 101 | class FixedWidthIter(object): 102 | 103 | def __init__(self): 104 | self.field = 0 105 | self.fieldValues = ['00','000','0000','00000','000000'] 106 | 107 | def __str__(self): 108 | return 'fixed widths' 109 | 110 | def __iter__(self): 111 | return self 112 | 113 | def next(self): 114 | nextValue = self.fieldValues[self.field] 115 | 116 | self.field = self.field + 1 117 | if self.field == 5: 118 | self.field = 0 119 | self.fieldValues[0] = 
str((int(self.fieldValues[0]) + 1) % 100).zfill(2) 120 | self.fieldValues[1] = str((int(self.fieldValues[1]) + 1) % 1000).zfill(3) 121 | self.fieldValues[2] = str((int(self.fieldValues[2]) + 1) % 10000).zfill(4) 122 | self.fieldValues[3] = str((int(self.fieldValues[3]) + 1) % 100000).zfill(5) 123 | self.fieldValues[4] = str((int(self.fieldValues[4]) + 1) % 1000000).zfill(6) 124 | 125 | return nextValue 126 | 127 | 128 | class QuoteIter(object): 129 | 130 | def __init__(self): 131 | self.value = 0 132 | 133 | def __str__(self): 134 | return 'quoted strings' 135 | 136 | def __iter__(self): 137 | return self 138 | 139 | def next(self): 140 | nextValue = self.value 141 | characters = list(str(nextValue)) 142 | nextValue = '"' + ',\n'.join(characters) + '"' 143 | 144 | self.value = self.value + 1 145 | return nextValue 146 | 147 | 148 | class DateTimeIter(object): 149 | 150 | def __init__(self): 151 | self.value = 0 152 | 153 | def __str__(self): 154 | return 'datetime' 155 | 156 | def __iter__(self): 157 | return self 158 | 159 | def next(self): 160 | nextValue = self.value 161 | self.value = self.value + 1 162 | return numpy.datetime64(nextValue, 'D') 163 | 164 | 165 | if __name__ == "__main__": 166 | import sys 167 | if len(sys.argv) != 2: 168 | sys.exit("Please define number of records in datasets: ") 169 | 170 | numRecords = int(sys.argv[1]) 171 | 172 | output = open('./data/ints', 'w') 173 | generate_dataset(output, IntIter(), ',', numRecords) 174 | output.close() 175 | 176 | output = open('./data/floats', 'w') 177 | generate_dataset(output, FloatIter(), ',', numRecords) 178 | output.close() 179 | 180 | output = open('./data/missingvalues', 'w') 181 | generate_dataset(output, MissingValuesIter(), ',', numRecords) 182 | output.close() 183 | 184 | output = open('./data/fixedwidths', 'w') 185 | generate_dataset(output, FixedWidthIter(), '', numRecords) 186 | output.close() 187 | 188 | input = open('./data/ints', 'rb') 189 | output = gzip.open('./data/ints.gz', 'wb') 
190 | output.writelines(input) 191 | output.close() 192 | input.close 193 | 194 | '''generate_dataset('ints2', IntIter(), ',', 12500000) 195 | generate_dataset('ints3', IntIter(), ',', 25000000) 196 | generate_dataset('signedints1', SignedIntIter(), ',', 2500000) 197 | generate_dataset('floats1', FloatIter(), ',', 1500000) 198 | generate_dataset('floats2', FloatIter(), ',', 7500000) 199 | generate_dataset('floats3', FloatIter(), ',', 15000000) 200 | generate_dataset('missingvalues1', MissingValuesIter(), ',', 3000000) 201 | generate_dataset('missingvalues2', MissingValuesIter(), ',', 15000000) 202 | generate_dataset('missingvalues3', MissingValuesIter(), ',', 30000000) 203 | generate_dataset('fixedwidth1', FixedWidthIter(), '', 5000000) 204 | generate_dataset('fixedwidth2', FixedWidthIter(), '', 25000000) 205 | generate_dataset('fixedwidth3', FixedWidthIter(), '', 50000000) 206 | generate_dataset('ints_spacedelim', IntIter(), ' ', 2500000) 207 | generate_dataset('quotes', QuoteIter(), ' ', 2500000) 208 | generate_dataset('datetime', DateTimeIter(), ',', 2500000)''' 209 | 210 | -------------------------------------------------------------------------------- /textadapter/tests/test_ints.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdint.h> 4 | #include <assert.h> 5 | #include <time.h> 6 | #include "../textadapter/text_adapter.h" 7 | #include "../textadapter/io_functions.h" 8 | 9 | 10 | int main() 11 | { 12 | uint64_t num_fields = 5; 13 | 14 | FILE *input = fopen("./data/ints", "r"); 15 | setvbuf(input, NULL, _IONBF, 0); 16 | 17 | TextAdapter *adapter = open_text_adapter((void *)input, NULL, &read_file, NULL, &seek_file, NULL); 18 | adapter->tokenize = &delim_tokenizer; 19 | set_num_fields(adapter, num_fields); 20 | adapter->delim_char = ','; 21 | adapter->quote_char = '\0'; 22 | adapter->comment_char = '\0'; 23 | 24 | int c; 25 | for (c = 0; c < num_fields; c++) 26 | { 27 | set_converter(adapter, c, sizeof(uint32_t),
&uint_converter, NULL); 28 | } 29 | 30 | uint32_t *data = calloc(10000000, sizeof(uint32_t)*num_fields); 31 | 32 | fseek(input, 0, SEEK_SET); 33 | 34 | clock_t t0 = clock(); 35 | uint64_t recs_read = 0; 36 | int result = read_records(adapter, 10000000, 1, (char *)data, &recs_read); 37 | clock_t t1 = clock(); 38 | 39 | assert(result == ADAPTER_SUCCESS); 40 | 41 | printf("PASSED: read %llu records in %.2lf seconds\n", recs_read, (double)(t1-t0) / (double)CLOCKS_PER_SEC); 42 | 43 | free(data); 44 | close_text_adapter(adapter); 45 | } 46 | --------------------------------------------------------------------------------