├── .gitignore ├── LICENSE ├── README.md ├── run ├── setup.py ├── src └── lazycsv │ ├── __init__.py │ └── lazycsv.c ├── tests ├── benchmark_lazy.py ├── fixtures │ ├── file.csv │ ├── file_crlf.csv │ ├── file_crlf2.csv │ ├── file_delimiter_and_quotechar.csv │ ├── file_empty.csv │ └── file_newline.csv ├── script_lazycsv.py └── test_lazycsv.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | venv/ 4 | .pytest_cache/ 5 | .tox/ 6 | .local/ 7 | __pycache__/ 8 | tests/fixtures/benchmarks/* 9 | 10 | *.so 11 | *.egg-info/ 12 | *.python-version 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Crunch.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lazycsv - a memory-efficient csv parser 2 | 3 | ###### Developers: Michael Green, Chris Perkins 4 | 5 | lazycsv is a C implementation of a csv parser for Python. The aim of this 6 | parser is to provide fast extraction of sequences of data from a CSV file 7 | in a memory-efficient manner, with zero dependencies. 8 | 9 | LazyCSV utilizes memory-mapped files and iterators to parse the file without 10 | persisting any significant amounts of data to physical memory. The design 11 | allows a user to generate PyObjects from a csv file lazily. 12 | 13 | The parser works as follows: 14 | 15 | First, the user file is memory-mapped internally to the LazyCSV object. That 16 | file is used to generate three indexes. The first is an index of values which 17 | correspond to the position in the user file where a given CSV field starts. 18 | This value is always a `uint16_t`, which we found to be the optimal bit size for 19 | disk usage and execution performance (this type can, however, be changed by 20 | setting the `LAZYCSV_INDEX_DTYPE` environment variable to any unsigned integer 21 | type). For index values outside the range of an unsigned short, an "anchor 22 | point" is created, which is a pair of `size_t` values that mark both the value 23 | to subtract from the index value so that it fits within 24 | 16 bits, and the first column of the CSV where the anchor value applies.
This 25 | anchor point is periodically written to the second index file when required for 26 | a given comma index. Finally, the third index writes the index of the first 27 | anchor point for each row of the file. 28 | 29 | When a user requests a sequence of data (i.e. a row or a column), an iterator 30 | is created and returned. This iterator uses the requested sequence value 31 | and its internal position state to look up in the index files the values 32 | representing the offset of the requested field and its length. Those two values 33 | are then used to create a single PyBytes object. These PyBytes objects are then 34 | yielded to the user per-iteration. 35 | 36 | This process is lazy, only yielding data from the user file as the iterator is 37 | consumed. It does not cache results as they are generated - it is the 38 | responsibility of the user to store in physical memory the data which must be 39 | persisted. The only persisted overhead in physical memory is the LazyCSV object 40 | itself, any created iterators, a small cache of common length-0 and length-1 41 | `PyObject*`'s for fast returns, and optionally the headers of the CSV file. 42 | 43 | ```python 44 | >>> from lazycsv import lazycsv 45 | >>> lazy = lazycsv.LazyCSV("tests/fixtures/file.csv") 46 | >>> lazy 47 | <lazycsv.LazyCSV object at 0x...> 48 | >>> (col := lazy.sequence(col=0)) 49 | <lazycsv_iterator object at 0x...> 50 | >>> next(col) 51 | b'0' 52 | >>> next(col) 53 | b'1' 54 | >>> next(col) 55 | Traceback (most recent call last): 56 | File "<stdin>", line 1, in <module> 57 | StopIteration 58 | ``` 59 | 60 | Since data is yielded through the iterator protocol, lazycsv pairs well with 61 | many of the builtin functional components of Python, and third-party libraries 62 | with support for iterators. This has the added benefit of keeping iterations in 63 | the C level, maximizing performance. 64 | 65 | ```python 66 | >>> row = lazy.sequence(row=1) 67 | >>> list(map(lambda x: x.decode('utf8'), row)) 68 | ['1', 'a1', 'b1'] 69 | >>> 70 | >>> import numpy as np 71 | >>> np.fromiter(map(int, lazy.sequence(col=0)), dtype=np.int64) 72 | array([0, 1]) 73 | ``` 74 | 75 | The `lazy` object also supports indexing operations for expressive iterables. 76 | The axis for iteration can be passed as a slice object, and the index of the 77 | iterable can be passed as an integer. Individual coordinate values can also be 78 | passed as a pair of integers; this call will eagerly return the value at that 79 | index. 80 | 81 | ```python 82 | >>> list(lazy[::-1, 1]) 83 | [b'a1', b'a0'] 84 | >>> lazy[-1, -1] 85 | b'b1' 86 | ``` 87 | 88 | Iterators can be materialized at any point by calling the `to_list()` or 89 | `to_numpy()` methods on the iterator object (to enable optional numpy support, 90 | see the Numpy section of this document). These methods exhaust the iterator, 91 | placing the remaining PyBytes values into a PyObject. 92 | 93 | ```python 94 | >>> col = lazy[:, 0] 95 | >>> next(col) 96 | b'0' 97 | >>> col.to_list() 98 | [b'1'] 99 | >>> 100 | ``` 101 | 102 | Headers are by default parsed from the csv file and packaged into a tuple under 103 | a `.headers` attribute. This can be skipped by passing `skip_headers=True` to 104 | the object constructor. Skipping the header parsing step results in the header 105 | value being included in the iterator.
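
Because `.headers` is an ordinary tuple, a column position can also be looked up
by name and fed back into `sequence()`. A small sketch, assuming the same fixture
file used in the examples above:

```python
>>> lazy = lazycsv.LazyCSV("tests/fixtures/file.csv")
>>> col = lazy.sequence(col=lazy.headers.index(b'ALPHA'))
>>> col.to_list()
[b'a0', b'a1']
```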
106 | 107 | *Note: `lazycsv` makes no effort to deduplicate headers and it is the 108 | responsibility of the user to make sure that columns are properly named.* 109 | 110 | ```python 111 | >>> lazy.headers 112 | (b'', b'ALPHA', b'BETA') 113 | >>> (col := lazy.sequence(col=1)) 114 | <lazycsv_iterator object at 0x...> 115 | >>> list(col) 116 | [b'a0', b'a1'] 117 | >>> lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 118 | >>> (col := lazy[:, 1]) 119 | <lazycsv_iterator object at 0x...> 120 | >>> list(col) 121 | [b'ALPHA', b'a0', b'a1'] 122 | ``` 123 | 124 | Fields which are double-quoted are by default yielded without quotes. This 125 | behavior can be disabled by passing `unquote=False` to the object constructor. 126 | 127 | ```python 128 | >>> lazy = lazycsv.LazyCSV( 129 | ... "tests/fixtures/file_crlf2.csv" 130 | ... ) 131 | >>> lazy.headers 132 | (b'', b'This,that', b'Fizz,Buzz') 133 | >>> lazy = lazycsv.LazyCSV( 134 | ... "tests/fixtures/file_crlf2.csv", unquote=False 135 | ... ) 136 | >>> lazy.headers 137 | (b'', b'"This,that"', b'"Fizz,Buzz"') 138 | ``` 139 | 140 | LazyCSV also provides the option to specify a delimiter and a quote character. 141 | Pass the keywords `delimiter=` and `quotechar=` to the object constructor to 142 | use custom values. `delimiter` defaults to `,` and `quotechar` 143 | defaults to `"`. 144 | 145 | ```python 146 | >>> lazy = lazycsv.LazyCSV( 147 | ... "tests/fixtures/file_delimiter_and_quotechar.csv", 148 | ... quotechar="|", 149 | ... delimiter="\t", 150 | ... unquote=False, 151 | ... ) 152 | ... 153 | >>> open(lazy.name, "rb").read() 154 | b'INDEX\tATTR\n0\t|A|\n1\t|B|\n' 155 | >>> list(lazy[:, 1]) 156 | [b'|A|', b'|B|'] 157 | ``` 158 | 159 | ### Numpy 160 | 161 | Optional, opt-in numpy support is built into the module. Access to this 162 | extended feature set requires building the extension from source while 163 | setting the `LAZYCSV_INCLUDE_NUMPY` environment variable to `1`. This adds a 164 | `to_numpy()` method to the iterator, which allows iterators to materialize into a 165 | 1-dimensional numpy array without creating intermediary PyObject*'s for each 166 | field of the CSV file. 167 | 168 | Access to this feature requires numpy to be preinstalled, as it makes 169 | numpy a compilation dependency. 170 | 171 | ```bash 172 | $ LAZYCSV_INCLUDE_NUMPY=1 python -m pip install lazycsv 173 | ``` 174 | ```python 175 | >>> import numpy as np 176 | >>> from lazycsv import lazycsv 177 | >>> lazy = lazycsv.LazyCSV("<path_to_csv_file>") 178 | >>> lazy = lazycsv.LazyCSV("./tests/fixtures/file.csv") 179 | >>> lazy.sequence(col=0).to_numpy().astype(np.int8) 180 | array([0, 1], dtype=int8) 181 | ``` 182 | 183 | Users pinned to an older version of numpy (<1.7) may wish to instead compile 184 | using a `LAZYCSV_INCLUDE_NUMPY_LEGACY=1` flag, which drops the API pin in the 185 | module while still compiling with numpy support. 186 | 187 | #### Benchmarks (CPU) 188 | 189 | CPU benchmarks are included below, run on a Ryzen 7 5800X inside a 190 | stock python3.9 docker container. 191 | 192 | ``` 193 | root@aa9d7c7ffb59:/code# python tests/benchmark_lazy.py 194 | filesize: 0.134gb 195 | cols=10000 196 | rows=10000 197 | sparsity=0.95 198 | 199 | benchmarking lazycsv: 200 | indexing lazy... time to index: 0.450414217018988 201 | parsing cols... time to parse: 1.5233540059998631 202 | total time: 1.9737682230188511 203 | 204 | benchmarking datatable: 205 | 100% |██████████████████████████████████████████████████| Reading data [done] 206 | creating datatables frame... time to object: 0.40828132900060154 207 | parsing cols... 
time to parse: 3.810204313998838 208 | total time: 4.21848564299944 209 | 210 | benchmarking polars (read): 211 | creating polars df... time to object: 2.357821761001105 212 | parsing cols... time to parse: 1.3874979300017003 213 | total time: 3.7453196910028055 214 | ``` 215 | 216 | ``` 217 | root@aa9d7c7ffb59:/code# python tests/benchmark_lazy.py 218 | filesize: 1.387gb 219 | cols=10000 220 | rows=100000 221 | sparsity=0.95 222 | 223 | benchmarking lazycsv: 224 | indexing lazy... time to index: 4.298127760004718 225 | parsing cols... time to parse: 18.591125406033825 226 | total time: 22.889253166038543 227 | 228 | benchmarking datatable: 229 | 100% |██████████████████████████████████████████████████| Reading data [done] 230 | creating datatables frame... time to object: 2.4456441220027045 231 | parsing cols... time to parse: 37.424315700998704 232 | total time: 39.86995982300141 233 | 234 | benchmarking polars (read): 235 | creating polars df... time to object: 22.383294907001982 236 | parsing cols... time to parse: 14.16580996599805 237 | total time: 36.54910487300003 238 | ``` 239 | 240 | ``` 241 | filesize: 14.333gb 242 | cols=100000 243 | rows=100000 244 | sparsity=0.95 245 | 246 | benchmarking lazycsv: 247 | indexing lazy... time to index: 55.42112316700002 248 | parsing cols... time to parse: 362.268973717 249 | total time: 417.690096884 250 | 251 | benchmarking datatable: 252 | 58% |█████████████████████████████▍ | Reading data Killed 253 | 254 | benchmarking polars (read): 255 | Killed 256 | ``` 257 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | versions="3.5 3.6 3.7 3.8 3.9 3.10 3.11 3.12 3.13" 4 | 5 | function run_version_tests { 6 | if [ "--build" = $2 ] 7 | then 8 | args="--inplace --force" 9 | LAZYCSV_INCLUDE_NUMPY=1 LAZYCSV_INDEX_DTYPE=uint8_t \ 10 | python setup.py build_ext $args &> /dev/null 11 | fi 12 | python -m pytest 13 | } 14 | 15 | function run_benchmarks { 16 | python ./tests/benchmark_lazy.py 17 | } 18 | 19 | function print_help { 20 | echo "bash commands:" 21 | echo "- bench: run benchmarks" 22 | echo "- test: run test suite" 23 | echo "- testrunner: spin up docker container for testing purposes" 24 | echo "- tox: run tox" 25 | } 26 | 27 | function run_testrunner { 28 | if [ -z $(command -v docker) ] 29 | then 30 | echo "environment tests requires docker executable" 31 | exit 1 32 | fi 33 | 34 | container=lazycsv_testrunner 35 | if [[ -z $(docker ps -a --format {{.Names}} | grep $container) ]] 36 | then 37 | docker run \ 38 | -v $(pwd):/code \ 39 | --name $container \ 40 | -e LAZYCSV_INCLUDE_NUMPY=1 \ 41 | -e PYENV_ROOT="/root/.pyenv" \ 42 | -e PATH="/root/.pyenv/shims:/root/.pyenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \ 43 | -dit debian:bookworm 44 | docker exec $container bash \ 45 | -c "apt-get update && apt-get install git wget tar make gcc build-essential gdb lcov pkg-config libbz2-dev libffi-dev libgdbm-dev libgdbm-compat-dev liblzma-dev libncurses5-dev libreadline6-dev libsqlite3-dev libssl-dev lzma lzma-dev tk-dev uuid-dev zlib1g-dev -y" 46 | docker exec $container bash \ 47 | -c "cd /root && git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv" 48 | docker exec $container bash -c "pyenv install $versions" 49 | docker exec $container bash -c "pyenv local $versions" 50 | for version in $versions 51 | do 52 | docker exec $container bash \ 53 | -c "python$version -m pip 
install numpy pytest && python$version -m pip install -e /code" 54 | done 55 | docker exec $container bash \ 56 | -c "python3.11 -m pip install tox" 57 | else 58 | docker start $container > /dev/null 59 | fi 60 | docker exec -it $container bash 61 | } 62 | 63 | function run_tox { 64 | python3.11 -m tox 65 | } 66 | 67 | function run_debug { 68 | LAZYCSV_INCLUDE_NUMPY=1 \ 69 | LAZYCSV_INDEX_DTYPE="uint8_t" \ 70 | LAZYCSV_DEBUG=1 \ 71 | CFLAGS="-O0" \ 72 | python setup.py build_ext --inplace --force \ 73 | && gdb --args python -m pytest ${@:2} 74 | } 75 | 76 | case $1 in 77 | testrunner) run_testrunner $@ ;; 78 | test) run_version_tests $@ ;; 79 | bench) run_benchmarks $@ ;; 80 | debug) run_debug $@ ;; 81 | tox) run_tox $@ ;; 82 | *) print_help $@ ;; 83 | esac 84 | 85 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import Extension, find_packages, setup 4 | 5 | LAZYCSV_DEBUG = int("LAZYCSV_DEBUG" in os.environ) 6 | LAZYCSV_INDEX_DTYPE = os.environ.get("LAZYCSV_INDEX_DTYPE", "uint16_t") 7 | 8 | LAZYCSV_INCLUDE_NUMPY = int("LAZYCSV_INCLUDE_NUMPY" in os.environ) 9 | LAZYCSV_INCLUDE_NUMPY_LEGACY = int("LAZYCSV_INCLUDE_NUMPY_LEGACY" in os.environ) 10 | 11 | include_dirs = ( 12 | [__import__("numpy").get_include()] 13 | if (LAZYCSV_INCLUDE_NUMPY | LAZYCSV_INCLUDE_NUMPY_LEGACY) 14 | else [] 15 | ) 16 | 17 | if not LAZYCSV_INDEX_DTYPE.startswith(("unsigned", "uint")): 18 | raise ValueError("specified LAZYCSV_INDEX_DTYPE must be an unsigned integer type") 19 | 20 | extensions = [ 21 | Extension( 22 | "lazycsv.lazycsv", 23 | [os.path.join("src", "lazycsv", "lazycsv.c")], 24 | include_dirs=include_dirs, 25 | define_macros=[ 26 | ("INDEX_DTYPE", LAZYCSV_INDEX_DTYPE), 27 | ("INCLUDE_NUMPY", LAZYCSV_INCLUDE_NUMPY), 28 | ("INCLUDE_NUMPY_LEGACY", LAZYCSV_INCLUDE_NUMPY_LEGACY), 29 | ("DEBUG", LAZYCSV_DEBUG), 30 | ], 31 | ) 32 | ] 33 | 34 | with open("README.md", "r", encoding="utf-8") as f: 35 | long_description = f.read() 36 | 37 | setup( 38 | name="lazycsv", 39 | version="1.1.7", 40 | author="Michael Green, Chris Perkins", 41 | author_email="dev@crunch.io", 42 | description="a fast, memory-efficient csv parser", 43 | long_description=long_description, 44 | long_description_content_type="text/markdown", 45 | packages=find_packages(where="src"), 46 | extras_require={ 47 | "test": ["pytest", "numpy"], 48 | "benchmark": ["datatable", "pandas", "pyarrow", "polars"], 49 | }, 50 | classifiers=[ 51 | "Development Status :: 5 - Production/Stable", 52 | "Intended Audience :: Developers", 53 | "License :: OSI Approved :: MIT License", 54 | "Natural Language :: English", 55 | "Operating System :: POSIX", 56 | "Operating System :: POSIX :: Linux", 57 | "Programming Language :: Python :: 3", 58 | "Programming Language :: Python :: 3.5", 59 | "Programming Language :: Python :: 3.6", 60 | "Programming Language :: Python :: 3.7", 61 | "Programming Language :: Python :: 3.8", 62 | "Programming Language :: Python :: 3.9", 63 | "Programming Language :: Python :: 3.10", 64 | "Programming Language :: Python :: 3.11", 65 | "Programming Language :: Python :: 3.12", 66 | "Programming Language :: Python :: 3.13", 67 | "Programming Language :: Python :: Implementation :: CPython", 68 | "Topic :: Utilities", 69 | ], 70 | package_dir={"": "src"}, 71 | ext_modules=extensions, 72 | ) 73 | --------------------------------------------------------------------------------
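
The environment variables read at the top of setup.py above map directly onto
build-time flags. A minimal sketch of a from-source build that exercises them,
mirroring the invocation used in the `run` script (the `uint32_t` value is only
an illustrative choice of unsigned integer type):

```bash
# build the extension in place with numpy support and a custom index dtype
$ LAZYCSV_INCLUDE_NUMPY=1 LAZYCSV_INDEX_DTYPE=uint32_t python setup.py build_ext --inplace --force
```
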
/src/lazycsv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Crunch-io/lazycsv/7949c63c0b681f270b72cca36a213f4af73bd64d/src/lazycsv/__init__.py -------------------------------------------------------------------------------- /src/lazycsv/lazycsv.c: -------------------------------------------------------------------------------- 1 | #include <fcntl.h> 2 | #include <limits.h> 3 | #include <stdint.h> 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <string.h> 7 | #include <sys/mman.h> 8 | #include <sys/stat.h> 9 | #include <unistd.h> 10 | 11 | #include <Python.h> 12 | #include "structmember.h" 13 | 14 | #define LINE_FEED 10 15 | #define CARRIAGE_RETURN 13 16 | 17 | // users can set this macro using the env variable LAZYCSV_INDEX_DTYPE if they 18 | // want to be more aggressive with minimizing index disk usage (i.e. define 19 | // INDEX_DTYPE as uint8_t) but at a cost to performance. 20 | 21 | #ifndef INDEX_DTYPE 22 | #define INDEX_DTYPE uint16_t 23 | #endif 24 | 25 | #ifdef DEBUG 26 | void PyDebug() {return;} 27 | #endif 28 | 29 | // optionally include a to_numpy() method on the iterable to materialize into a 30 | // numpy array, requires numpy install and to be set explicitly using env 31 | // variable LAZYCSV_INCLUDE_NUMPY=1, and LAZYCSV_INCLUDE_NUMPY_LEGACY=1 to 32 | // install using legacy numpy APIs 33 | 34 | #ifndef INCLUDE_NUMPY_LEGACY 35 | #define INCLUDE_NUMPY_LEGACY 0 36 | #endif 37 | #if INCLUDE_NUMPY_LEGACY 38 | #ifdef INCLUDE_NUMPY 39 | #undef INCLUDE_NUMPY 40 | #endif 41 | #define INCLUDE_NUMPY 1 42 | #else 43 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 44 | #endif 45 | #ifndef INCLUDE_NUMPY 46 | #define INCLUDE_NUMPY 0 47 | #endif 48 | #if INCLUDE_NUMPY 49 | #include <numpy/arrayobject.h> 50 | #endif 51 | 52 | 53 | static size_t INDEX_DTYPE_MAX = ((INDEX_DTYPE) ~(INDEX_DTYPE)0); 54 | 55 | 56 | typedef struct { 57 | char* data; 58 | size_t size; 59 | size_t capacity; 60 | } LazyCSV_Buffer; 61 | 62 | 63 | typedef struct { 64 | PyObject* empty; 65 | PyObject** items; 66 | } LazyCSV_Cache; 67 | 68 | 69 | typedef struct { 70 | size_t col; 71 | size_t value; 72 | } LazyCSV_AnchorPoint; 73 | 74 | 75 | typedef struct { 76 | size_t index; 77 | size_t count; 78 | } LazyCSV_RowIndex; 79 | 80 | 81 | typedef struct { 82 | int fd; 83 | struct stat st; 84 | char* name; 85 | char* data; 86 | } LazyCSV_File; 87 | 88 | 89 | typedef struct { 90 | PyObject* dir; 91 | LazyCSV_File* commas; 92 | LazyCSV_File* anchors; 93 | LazyCSV_File* newlines; 94 | } LazyCSV_Index; 95 | 96 | 97 | typedef struct { 98 | PyObject_HEAD 99 | PyObject* headers; 100 | PyObject* name; 101 | size_t rows; 102 | size_t cols; 103 | int _skip_headers; 104 | int _unquote; 105 | char _quotechar; 106 | char _newline; 107 | LazyCSV_Index* _index; 108 | LazyCSV_File* _data; 109 | LazyCSV_Cache* _cache; 110 | } LazyCSV; 111 | 112 | 113 | typedef struct { 114 | PyObject_HEAD 115 | PyObject* lazy; 116 | size_t row; 117 | size_t col; 118 | size_t position; 119 | size_t stop; 120 | size_t step; 121 | char reversed; 122 | } LazyCSV_Iter; 123 | 124 | 125 | static inline ssize_t LazyCSV_BufferWrite(int fd, LazyCSV_Buffer *buffer, 126 | void *data, size_t size) { 127 | 128 | ssize_t bytes_written = 0; 129 | 130 | if (buffer->size + size >= buffer->capacity) { 131 | bytes_written = write(fd, buffer->data, buffer->size); 132 | buffer->size = 0; 133 | } 134 | memcpy(&buffer->data[buffer->size], data, size); 135 | buffer->size += size; 136 | return bytes_written; 137 | } 138 | 139 | 140 | static inline void LazyCSV_BufferCache(LazyCSV_Buffer *buffer, void *data, 141 | size_t
size) { 142 | 143 | if (size == 0) return; 144 | 145 | if (buffer->size + size >= buffer->capacity) { 146 | buffer->capacity += size; 147 | buffer->capacity *= 1.3; 148 | buffer->data = realloc(buffer->data, buffer->capacity); 149 | } 150 | memcpy(&buffer->data[buffer->size], data, size); 151 | buffer->size += size; 152 | } 153 | 154 | 155 | static inline ssize_t LazyCSV_BufferFlush(int comma_file, LazyCSV_Buffer *buffer) { 156 | ssize_t bytes_written = 0; 157 | bytes_written = write(comma_file, buffer->data, buffer->size); 158 | if (bytes_written < 0) { 159 | return bytes_written; 160 | } 161 | buffer->size = 0; 162 | fsync(comma_file); 163 | return bytes_written; 164 | } 165 | 166 | 167 | static inline ssize_t LazyCSV_ValueToDisk(size_t value, LazyCSV_RowIndex *ridx, 168 | LazyCSV_AnchorPoint *apnt, 169 | size_t col_index, int cfile, 170 | LazyCSV_Buffer *cbuf, int afile, 171 | LazyCSV_Buffer *abuf) { 172 | 173 | size_t target = value - apnt->value; 174 | 175 | if (target > INDEX_DTYPE_MAX) { 176 | *apnt = (LazyCSV_AnchorPoint){.value = value, .col = col_index+1}; 177 | ssize_t bytes_written = LazyCSV_BufferWrite(afile, abuf, apnt, sizeof(LazyCSV_AnchorPoint)); 178 | if (bytes_written < 0) 179 | return bytes_written; 180 | ridx->count += 1; 181 | target = 0; 182 | } 183 | 184 | INDEX_DTYPE item = target; 185 | 186 | return LazyCSV_BufferWrite(cfile, cbuf, &item, sizeof(INDEX_DTYPE)); 187 | } 188 | 189 | 190 | static inline size_t LazyCSV_AnchorValueFromValue(size_t value, 191 | LazyCSV_AnchorPoint *amap, 192 | LazyCSV_RowIndex *ridx) { 193 | 194 | LazyCSV_AnchorPoint *apnt = amap + ridx->count - 1; 195 | 196 | if (value >= apnt->col) { 197 | // we hit this if there is only one anchor point, or we're iterating 198 | // over the last anchor point. 199 | return apnt->value; 200 | } 201 | 202 | LazyCSV_AnchorPoint* apntp1; 203 | size_t L = 0, R = ridx->count-1; 204 | 205 | while (L <= R) { 206 | size_t M = L + ((R - L)/2); 207 | apnt = amap + M; 208 | apntp1 = apnt + 1; 209 | if (value > apntp1->col) { 210 | L = M + 1; 211 | } 212 | else if (value < apnt->col) { 213 | R = M - 1; 214 | } 215 | else if (value == apntp1->col) { 216 | return apntp1->value; 217 | } 218 | else { 219 | return apnt->value; 220 | } 221 | } 222 | return SIZE_MAX; 223 | } 224 | 225 | 226 | static inline size_t LazyCSV_ValueFromIndex(size_t value, 227 | LazyCSV_RowIndex *ridx, char *cmap, 228 | char *amap) { 229 | 230 | size_t cval = *(INDEX_DTYPE *)(cmap + (value * sizeof(INDEX_DTYPE))); 231 | size_t aval = 232 | LazyCSV_AnchorValueFromValue(value, (LazyCSV_AnchorPoint *)amap, ridx); 233 | return aval == SIZE_MAX ? aval : cval + aval; 234 | } 235 | 236 | 237 | static inline void LazyCSV_IterCol(LazyCSV_Iter *iter, size_t *offset, 238 | size_t *len) { 239 | 240 | LazyCSV *lazy = (LazyCSV *)iter->lazy; 241 | 242 | if (iter->position < iter->stop) { 243 | size_t position = 244 | iter->reversed 245 | ? 
lazy->rows - 1 - iter->position + !lazy->_skip_headers 246 | : iter->position + !lazy->_skip_headers; 247 | 248 | iter->position += iter->step; 249 | 250 | char* newlines = lazy->_index->newlines->data; 251 | char* anchors = lazy->_index->anchors->data; 252 | char* commas = lazy->_index->commas->data; 253 | 254 | LazyCSV_RowIndex* ridx = 255 | (LazyCSV_RowIndex*) 256 | (newlines + position*sizeof(LazyCSV_RowIndex)); 257 | 258 | char* aidx = anchors+ridx->index; 259 | char* cidx = commas+((lazy->cols+1)*position*sizeof(INDEX_DTYPE)); 260 | 261 | size_t cs = LazyCSV_ValueFromIndex(iter->col, ridx, cidx, aidx); 262 | size_t ce = LazyCSV_ValueFromIndex(iter->col + 1, ridx, cidx, aidx); 263 | 264 | *len = ce - cs - 1; 265 | *offset = cs; 266 | } 267 | } 268 | 269 | 270 | static inline void LazyCSV_IterRow(LazyCSV_Iter *iter, size_t *offset, 271 | size_t *len) { 272 | 273 | LazyCSV *lazy = (LazyCSV *)iter->lazy; 274 | 275 | if (iter->position < iter->stop) { 276 | size_t position = 277 | iter->reversed ? lazy->cols - iter->position - 1 : iter->position; 278 | 279 | iter->position += iter->step; 280 | 281 | char* newlines = lazy->_index->newlines->data; 282 | char* anchors = lazy->_index->anchors->data; 283 | char* commas = lazy->_index->commas->data; 284 | 285 | size_t row = iter->row + !lazy->_skip_headers; 286 | 287 | LazyCSV_RowIndex* ridx = 288 | (LazyCSV_RowIndex*) 289 | (newlines + row*sizeof(LazyCSV_RowIndex)); 290 | 291 | char *aidx = anchors + ridx->index; 292 | char *cidx = commas + ((lazy->cols + 1) * row * sizeof(INDEX_DTYPE)); 293 | 294 | size_t cs = LazyCSV_ValueFromIndex(position, ridx, cidx, aidx); 295 | size_t ce = LazyCSV_ValueFromIndex(position + 1, ridx, cidx, aidx); 296 | 297 | *len = ce - cs - 1; 298 | *offset = cs; 299 | } 300 | } 301 | 302 | 303 | static inline PyObject *PyBytes_FromOffsetAndLen(LazyCSV *lazy, size_t offset, 304 | size_t len) { 305 | 306 | PyObject* result; 307 | char* addr; 308 | 309 | switch (len) { 310 | case SIZE_MAX: 311 | case 0: 312 | // short circuit if result is empty string 313 | result = lazy->_cache->empty; 314 | Py_INCREF(result); 315 | break; 316 | case 1: 317 | addr = lazy->_data->data + offset; 318 | unsigned char index = *addr; // explicit unsigned char for indexing purposes, 319 | // *(char*) signed-ness is ambiguous and on some 320 | // architectures it results in a negative number 321 | result = lazy->_cache->items[index]; 322 | Py_INCREF(result); 323 | break; 324 | default: 325 | addr = lazy->_data->data + offset; 326 | 327 | char strip_quotes = ( 328 | lazy->_unquote 329 | && addr[0] == lazy->_quotechar 330 | && addr[len-1] == lazy->_quotechar 331 | ); 332 | 333 | if (strip_quotes) { 334 | addr = addr+1; 335 | len = len-2; 336 | } 337 | 338 | result = PyBytes_FromStringAndSize(addr, len); 339 | 340 | if (!result) { 341 | PyErr_SetString( 342 | PyExc_RuntimeError, 343 | "could not allocate memory for new object" 344 | ); 345 | return NULL; 346 | } 347 | } 348 | 349 | return result; 350 | } 351 | 352 | 353 | static PyObject* LazyCSV_IterNext(PyObject* self) { 354 | LazyCSV_Iter* iter = (LazyCSV_Iter*)self; 355 | LazyCSV *lazy = (LazyCSV *)iter->lazy; 356 | 357 | size_t offset = SIZE_MAX, len; 358 | 359 | switch ((iter->row == SIZE_MAX) - (iter->col == SIZE_MAX)) { 360 | case -1: 361 | LazyCSV_IterRow(iter, &offset, &len); 362 | break; 363 | case +1: 364 | LazyCSV_IterCol(iter, &offset, &len); 365 | break; 366 | default: 367 | PyErr_SetString( 368 | PyExc_RuntimeError, 369 | "could not determine axis for materialization" 370 | ); 371 | 
return NULL; 372 | } 373 | 374 | if (offset==SIZE_MAX) { 375 | PyErr_SetNone(PyExc_StopIteration); 376 | return NULL; 377 | } 378 | 379 | return PyBytes_FromOffsetAndLen(lazy, offset, len); 380 | } 381 | 382 | 383 | static PyObject* LazyCSV_IterAsList(PyObject* self) { 384 | LazyCSV_Iter* iter = (LazyCSV_Iter*)self; 385 | LazyCSV* lazy = (LazyCSV*)iter->lazy; 386 | 387 | size_t size; 388 | size_t iter_col = iter->col; 389 | size_t iter_row = iter->row; 390 | 391 | if (iter_col == SIZE_MAX) { 392 | size = lazy->cols - iter->position; 393 | } 394 | else if (iter_row == SIZE_MAX) { 395 | size = lazy->rows - iter->position; 396 | } 397 | else { 398 | PyErr_SetString( 399 | PyExc_RuntimeError, 400 | "could not determine axis for materialization" 401 | ); 402 | return NULL; 403 | } 404 | 405 | PyObject* result = PyList_New(size); 406 | if (!result) { 407 | PyErr_SetString( 408 | PyExc_RuntimeError, 409 | "could not allocate memory for new list" 410 | ); 411 | return NULL; 412 | } 413 | size_t offset=SIZE_MAX, len=0; 414 | 415 | PyObject* item; 416 | for (size_t i = 0; i < size; i++) { 417 | switch (iter_col) { 418 | case SIZE_MAX: 419 | LazyCSV_IterRow(iter, &offset, &len); 420 | break; 421 | default: 422 | LazyCSV_IterCol(iter, &offset, &len); 423 | } 424 | item = PyBytes_FromOffsetAndLen(lazy, offset, len); 425 | if (!item) { 426 | // err msg set in PyBytes_FromOffsetAndLen, 427 | // just need to decref the list here 428 | Py_DECREF(result); 429 | return item; 430 | } 431 | PyList_SET_ITEM(result, i, item); 432 | } 433 | 434 | return result; 435 | } 436 | 437 | 438 | #if INCLUDE_NUMPY 439 | static PyObject* LazyCSV_IterAsNumpy(PyObject* self) { 440 | LazyCSV_Iter* iter = (LazyCSV_Iter*)self; 441 | LazyCSV* lazy = (LazyCSV*)iter->lazy; 442 | 443 | size_t size; 444 | size_t iter_col = iter->col; 445 | size_t iter_row = iter->row; 446 | 447 | if (iter_col != SIZE_MAX) { 448 | size = lazy->rows - iter->position; 449 | } 450 | else if (iter_row != SIZE_MAX) { 451 | size = lazy->cols - iter->position; 452 | } 453 | else { 454 | PyErr_SetString( 455 | PyExc_RuntimeError, 456 | "could not determine axis for materialization" 457 | ); 458 | return NULL; 459 | } 460 | 461 | size_t buffer_capacity = 65536; // 2**16 462 | LazyCSV_Buffer buffer = {.data = malloc(buffer_capacity), 463 | .size = 0, 464 | .capacity = buffer_capacity}; 465 | 466 | size_t offset, len=0, max_len=0; 467 | char* addr; 468 | 469 | for (size_t i=0; i < size; i++) { 470 | switch (iter_col) { 471 | case SIZE_MAX: 472 | LazyCSV_IterRow(iter, &offset, &len); 473 | break; 474 | default: 475 | LazyCSV_IterCol(iter, &offset, &len); 476 | } 477 | addr = lazy->_data->data + offset; 478 | LazyCSV_BufferCache(&buffer, &len, sizeof(size_t)); 479 | LazyCSV_BufferCache(&buffer, addr, len); 480 | max_len = len > max_len ? 
len : max_len; 481 | } 482 | 483 | npy_intp const dimensions[1] = {size, }; 484 | npy_intp const strides[1] = {max_len, }; 485 | 486 | PyArrayObject *arr = 487 | (PyArrayObject *)PyArray_New(&PyArray_Type, 1, dimensions, NPY_STRING, 488 | strides, NULL, max_len, 0, NULL); 489 | 490 | if (!arr) { 491 | free(buffer.data); 492 | PyErr_SetString( 493 | PyExc_RuntimeError, 494 | "could not allocate numpy array" 495 | ); 496 | return NULL; 497 | } 498 | 499 | char* tempbuf = buffer.data; 500 | char* arrdata = PyArray_DATA(arr); 501 | 502 | for (size_t i = 0; i < size; i++) { 503 | len = *(size_t *)tempbuf; 504 | tempbuf += sizeof(size_t); 505 | size_t padlen = max_len - len; 506 | strncpy(arrdata, tempbuf, len); 507 | tempbuf += len; 508 | arrdata += len; 509 | memset(arrdata, 0, padlen); 510 | arrdata += padlen; 511 | } 512 | 513 | free(buffer.data); 514 | 515 | return PyArray_Return(arr); 516 | } 517 | #endif 518 | 519 | 520 | static PyObject* LazyCSV_IterSelf(PyObject* self) { 521 | Py_INCREF(self); 522 | return self; 523 | } 524 | 525 | 526 | static void LazyCSV_IterDestruct(LazyCSV_Iter* self) { 527 | Py_DECREF(self->lazy); 528 | Py_TYPE(self)->tp_free((PyObject*)self); 529 | } 530 | 531 | 532 | static PyMethodDef LazyCSV_IterMethods[] = { 533 | #if INCLUDE_NUMPY 534 | { 535 | "to_numpy", 536 | (PyCFunction)LazyCSV_IterAsNumpy, 537 | METH_NOARGS, 538 | "materialize iterator into a numpy array" 539 | }, 540 | #endif 541 | { 542 | "to_list", 543 | (PyCFunction)LazyCSV_IterAsList, 544 | METH_NOARGS, 545 | "materialize iterator into a list" 546 | }, 547 | {NULL, } 548 | }; 549 | 550 | 551 | static PyTypeObject LazyCSV_IterType = { 552 | PyVarObject_HEAD_INIT(NULL, 0) 553 | .tp_name = "lazycsv_iterator", 554 | .tp_itemsize = sizeof(LazyCSV_Iter), 555 | .tp_dealloc = (destructor)LazyCSV_IterDestruct, 556 | .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, 557 | .tp_doc = "LazyCSV iterable", 558 | .tp_methods = LazyCSV_IterMethods, 559 | .tp_iter = LazyCSV_IterSelf, 560 | .tp_iternext = LazyCSV_IterNext, 561 | }; 562 | 563 | 564 | static inline void LazyCSV_TempDirAsString(PyObject **tempdir, char **dirname) { 565 | PyObject *tempfile = PyImport_ImportModule("tempfile"); 566 | PyObject *tempdir_obj = 567 | PyObject_GetAttrString(tempfile, "TemporaryDirectory"); 568 | 569 | *tempdir = PyObject_CallObject(tempdir_obj, NULL); 570 | PyObject* dirname_obj = PyObject_GetAttrString(*tempdir, "name"); 571 | PyObject* dirstring = PyUnicode_AsUTF8String(dirname_obj); 572 | *dirname = PyBytes_AsString(dirstring); 573 | 574 | Py_DECREF(tempfile); 575 | Py_DECREF(tempdir_obj); 576 | Py_DECREF(dirname_obj); 577 | Py_DECREF(dirstring); 578 | } 579 | 580 | 581 | static inline void LazyCSV_FullNameFromName(PyObject *name, 582 | PyObject **fullname_obj, 583 | char **fullname) { 584 | 585 | PyObject *os_path = PyImport_ImportModule("os.path"); 586 | PyObject *isfile = PyObject_CallMethod(os_path, "isfile", "O", name); 587 | 588 | PyObject *builtins = PyImport_ImportModule("builtins"); 589 | PyObject* global_vars = PyObject_CallMethod(builtins, "globals", NULL); 590 | 591 | // borrowed ref 592 | PyObject* __file__ = PyDict_GetItemString(global_vars, "__file__"); 593 | 594 | if (isfile == Py_True) { 595 | // owned reference which we keep 596 | *fullname_obj = PyObject_CallMethod(os_path, "abspath", "O", name); 597 | *fullname = PyBytes_AsString(*fullname_obj); 598 | } 599 | 600 | else if (__file__) { 601 | // also check to see if file is relative to the caller if not 602 | // previously found 603 | PyObject 
*_dirname = 604 | PyObject_CallMethod(os_path, "dirname", "O", __file__); 605 | 606 | PyObject *dirname = PyUnicode_AsUTF8String(_dirname); 607 | 608 | PyObject *joined = 609 | PyObject_CallMethod(os_path, "join", "(OO)", dirname, name); 610 | 611 | *fullname_obj = PyObject_CallMethod(os_path, "abspath", "O", joined); 612 | *fullname = PyBytes_AsString(*fullname_obj); 613 | 614 | Py_DECREF(joined); 615 | Py_DECREF(_dirname); 616 | Py_DECREF(dirname); 617 | } 618 | 619 | Py_DECREF(os_path); 620 | Py_DECREF(isfile); 621 | Py_DECREF(builtins); 622 | Py_DECREF(global_vars); 623 | } 624 | 625 | 626 | static PyObject *LazyCSV_New(PyTypeObject *type, PyObject *args, 627 | PyObject *kwargs) { 628 | 629 | PyObject* name; 630 | int skip_headers = 0; 631 | int unquote = 1; 632 | Py_ssize_t buffer_capacity = 2097152; // 2**21 633 | char *dirname, *delimiter = ",", *quotechar = "\""; 634 | ssize_t bytes_written = 0; 635 | 636 | static char* kwlist[] = { 637 | "", "delimiter", "quotechar", "skip_headers", "unquote", "buffer_size", "index_dir", NULL 638 | }; 639 | 640 | char ok = PyArg_ParseTupleAndKeywords( 641 | args, kwargs, "O|ssppns", kwlist, &name, &delimiter, &quotechar, 642 | &skip_headers, &unquote, &buffer_capacity, &dirname); 643 | 644 | if (!ok) { 645 | PyErr_SetString( 646 | PyExc_ValueError, 647 | "unable to parse function arguments" 648 | ); 649 | return NULL; 650 | } 651 | 652 | if (buffer_capacity < 0) { 653 | PyErr_SetString( 654 | PyExc_ValueError, 655 | "buffer size cannot be less than 0" 656 | ); 657 | return NULL; 658 | } 659 | 660 | Py_INCREF(name); 661 | if (PyUnicode_CheckExact(name)) { 662 | PyObject* _name = PyUnicode_AsUTF8String(name); 663 | Py_DECREF(name); 664 | name = _name; 665 | } 666 | 667 | if (!PyBytes_CheckExact(name)) { 668 | PyErr_SetString( 669 | PyExc_ValueError, 670 | "first argument must be str or bytes" 671 | ); 672 | Py_DECREF(name); 673 | return NULL; 674 | } 675 | 676 | PyObject* fullname_obj = NULL; 677 | char* fullname = NULL; 678 | LazyCSV_FullNameFromName(name, &fullname_obj, &fullname); 679 | 680 | if (!fullname_obj) { 681 | PyErr_SetString( 682 | PyExc_MemoryError, 683 | "unable to initialize filepath object" 684 | ); 685 | return NULL; 686 | } 687 | 688 | if (!fullname) { 689 | Py_XDECREF(fullname_obj); 690 | PyErr_SetString( 691 | PyExc_MemoryError, 692 | "unable to determine file path from *PyObject" 693 | ); 694 | return NULL; 695 | } 696 | 697 | Py_DECREF(name); 698 | 699 | int ufd = open(fullname, O_RDONLY); 700 | if (ufd == -1) { 701 | PyErr_SetString( 702 | PyExc_FileNotFoundError, 703 | "unable to open data file," 704 | " check to be sure that the user has read permissions" 705 | " and/or ownership of the file, and that the file exists."
706 | ); 707 | return NULL; 708 | } 709 | 710 | struct stat ust; 711 | if (fstat(ufd, &ust) < 0) { 712 | PyErr_SetString( 713 | PyExc_RuntimeError, 714 | "unable to stat user file" 715 | ); 716 | goto close_ufd; 717 | } 718 | 719 | size_t file_len = ust.st_size; 720 | 721 | int mmap_flags = PROT_READ; 722 | char* file = mmap(NULL, file_len, mmap_flags, MAP_PRIVATE, ufd, 0); 723 | 724 | PyObject* tempdir = NULL; 725 | if (!dirname) { 726 | LazyCSV_TempDirAsString(&tempdir, &dirname); 727 | } 728 | 729 | char* comma_index = tempnam(dirname, "LzyC_"); 730 | char* anchor_index = tempnam(dirname, "LzyA_"); 731 | char* newline_index = tempnam(dirname, "LzyN_"); 732 | 733 | int file_flags = O_WRONLY|O_CREAT|O_EXCL; 734 | 735 | int comma_file = open(comma_index, file_flags, S_IRWXU); 736 | if (comma_file < 0) 737 | goto close_comma_file; 738 | 739 | int anchor_file = open(anchor_index, file_flags, S_IRWXU); 740 | if (anchor_file < 0) 741 | goto close_anchor_file; 742 | 743 | int newline_file = open(newline_index, file_flags, S_IRWXU); 744 | if (newline_file < 0) 745 | goto close_newline_file; 746 | 747 | char quoted = 0, c, cm1 = LINE_FEED, cm2 = 0; 748 | size_t rows = 0, cols = SIZE_MAX, row_index = 0, col_index = 0; 749 | 750 | int newline = -1; 751 | 752 | // overflow happens when a row has more columns than the header row, 753 | // if this happens during the parse, the comma of the nth col will indicate 754 | // the line ending. Underflow happens when a row has less columns than the 755 | // header row, and missing values will be appended to the row as an empty 756 | // field. 757 | 758 | size_t overflow = SIZE_MAX; 759 | char *overflow_warning = NULL, *underflow_warning = NULL; 760 | 761 | LazyCSV_RowIndex ridx = {.index = 0, .count = 0}; 762 | 763 | LazyCSV_AnchorPoint apnt; 764 | 765 | LazyCSV_Buffer comma_buffer = {.data = malloc(buffer_capacity), 766 | .size = 0, 767 | .capacity = buffer_capacity}; 768 | 769 | LazyCSV_Buffer anchor_buffer = {.data = malloc(buffer_capacity), 770 | .size = 0, 771 | .capacity = buffer_capacity}; 772 | 773 | LazyCSV_Buffer newline_buffer = {.data = malloc(buffer_capacity), 774 | .size = 0, 775 | .capacity = buffer_capacity}; 776 | 777 | for (size_t i = 0; i < file_len; i++) { 778 | 779 | if (overflow != SIZE_MAX && i < overflow) { 780 | continue; 781 | } 782 | 783 | c = file[i]; 784 | 785 | if (col_index == 0 786 | && (cm1 == LINE_FEED || cm1 == CARRIAGE_RETURN) 787 | && cm2 != CARRIAGE_RETURN) { 788 | size_t val = ( 789 | newline == (CARRIAGE_RETURN+LINE_FEED) 790 | ) ? 
i + 1 : i; 791 | 792 | apnt = (LazyCSV_AnchorPoint){.value = val, .col = col_index}; 793 | 794 | LazyCSV_BufferWrite(anchor_file, &anchor_buffer, &apnt, 795 | sizeof(LazyCSV_AnchorPoint)); 796 | 797 | ridx.index += ridx.count*sizeof(LazyCSV_AnchorPoint); 798 | ridx.count = 1; 799 | 800 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 801 | &comma_buffer, anchor_file, &anchor_buffer); 802 | if (bytes_written < 0) 803 | goto close_newline_file; 804 | } 805 | 806 | if (c == *quotechar) { 807 | quoted = !quoted; 808 | } 809 | 810 | else if (!quoted && c == *delimiter) { 811 | size_t val = i + 1; 812 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 813 | &comma_buffer, anchor_file, &anchor_buffer); 814 | if (bytes_written < 0) 815 | goto close_newline_file; 816 | if (cols == SIZE_MAX || col_index < cols) { 817 | col_index += 1; 818 | } 819 | else { 820 | overflow_warning = 821 | "column overflow encountered while parsing CSV, " 822 | "extra values will be truncated!"; 823 | overflow = i; 824 | 825 | for (;;) { 826 | if (file[overflow] == LINE_FEED || 827 | file[overflow] == CARRIAGE_RETURN) 828 | break; 829 | else if (overflow >= file_len) 830 | break; 831 | overflow += 1; 832 | } 833 | } 834 | } 835 | 836 | else if (!quoted && c == LINE_FEED && cm1 == CARRIAGE_RETURN) { 837 | // no-op, don't match next block for \r\n 838 | } 839 | 840 | else if (!quoted && (c == CARRIAGE_RETURN || c == LINE_FEED)) { 841 | size_t val = i + 1; 842 | 843 | if (overflow == SIZE_MAX) { 844 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 845 | &comma_buffer, anchor_file, &anchor_buffer); 846 | if (bytes_written < 0) 847 | goto close_newline_file; 848 | } 849 | else { 850 | overflow = SIZE_MAX; 851 | } 852 | 853 | if (row_index == 0) { 854 | cols = col_index; 855 | } 856 | 857 | else if (col_index < cols) { 858 | underflow_warning = 859 | "column underflow encountered while parsing CSV, " 860 | "missing values will be filled with the empty bytestring!"; 861 | while (col_index < cols) { 862 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 863 | &comma_buffer, anchor_file, 864 | &anchor_buffer); 865 | if (bytes_written < 0) 866 | goto close_newline_file; 867 | col_index += 1; 868 | } 869 | } 870 | 871 | if (newline == -1) { 872 | newline = (c == CARRIAGE_RETURN && file[i + 1] == LINE_FEED) 873 | ? 
LINE_FEED + CARRIAGE_RETURN 874 | : c; 875 | } 876 | 877 | LazyCSV_BufferWrite(newline_file, &newline_buffer, &ridx, 878 | sizeof(LazyCSV_RowIndex)); 879 | 880 | col_index = 0; 881 | row_index += 1; 882 | } 883 | 884 | cm2 = cm1; 885 | cm1 = c; 886 | } 887 | 888 | char last_char = file[file_len - 1]; 889 | char overcount = last_char == CARRIAGE_RETURN || last_char == LINE_FEED; 890 | 891 | if (!overcount) { 892 | bytes_written = LazyCSV_ValueToDisk(file_len + 1, &ridx, &apnt, col_index, comma_file, 893 | &comma_buffer, anchor_file, &anchor_buffer); 894 | if (bytes_written < 0) 895 | goto close_newline_file; 896 | 897 | bytes_written = LazyCSV_BufferWrite(newline_file, &newline_buffer, &ridx, 898 | sizeof(LazyCSV_RowIndex)); 899 | 900 | if (bytes_written < 0) 901 | goto close_newline_file; 902 | } 903 | 904 | if (overflow_warning) 905 | PyErr_WarnEx( 906 | PyExc_RuntimeWarning, 907 | overflow_warning, 908 | 1 909 | ); 910 | 911 | if (underflow_warning) 912 | PyErr_WarnEx( 913 | PyExc_RuntimeWarning, 914 | underflow_warning, 915 | 1 916 | ); 917 | 918 | rows = row_index - overcount + skip_headers; 919 | cols = cols + 1; 920 | 921 | bytes_written = LazyCSV_BufferFlush(comma_file, &comma_buffer); 922 | if (bytes_written < 0) 923 | goto close_newline_file; 924 | bytes_written = LazyCSV_BufferFlush(anchor_file, &anchor_buffer); 925 | if (bytes_written < 0) 926 | goto close_newline_file; 927 | bytes_written = LazyCSV_BufferFlush(newline_file, &newline_buffer); 928 | if (bytes_written < 0) 929 | goto close_newline_file; 930 | 931 | close(comma_file); 932 | close(anchor_file); 933 | close(newline_file); 934 | 935 | free(comma_buffer.data); 936 | free(anchor_buffer.data); 937 | free(newline_buffer.data); 938 | 939 | comma_file = open(comma_index, O_RDWR); 940 | struct stat comma_st; 941 | if (fstat(comma_file, &comma_st) < 0) { 942 | PyErr_SetString( 943 | PyExc_RuntimeError, 944 | "unable to stat comma file" 945 | ); 946 | goto close_comma_file; 947 | } 948 | 949 | anchor_file = open(anchor_index, O_RDWR); 950 | struct stat anchor_st; 951 | if (fstat(anchor_file, &anchor_st) < 0) { 952 | PyErr_SetString( 953 | PyExc_RuntimeError, 954 | "unable to stat anchor file" 955 | ); 956 | goto close_anchor_file; 957 | } 958 | 959 | newline_file = open(newline_index, O_RDWR); 960 | struct stat newline_st; 961 | if (fstat(newline_file, &newline_st) < 0) { 962 | PyErr_SetString( 963 | PyExc_RuntimeError, 964 | "unable to stat newline file" 965 | ); 966 | goto close_newline_file; 967 | } 968 | 969 | char *comma_memmap = 970 | mmap(NULL, comma_st.st_size, mmap_flags, MAP_PRIVATE, comma_file, 0); 971 | char *anchor_memmap = 972 | mmap(NULL, anchor_st.st_size, mmap_flags, MAP_PRIVATE, anchor_file, 0); 973 | char *newline_memmap = 974 | mmap(NULL, newline_st.st_size, mmap_flags, MAP_PRIVATE, newline_file, 0); 975 | 976 | PyObject* headers; 977 | 978 | if (!skip_headers) { 979 | headers = PyTuple_New(cols); 980 | LazyCSV_RowIndex* ridx = (LazyCSV_RowIndex*)newline_memmap; 981 | 982 | size_t cs, ce; 983 | size_t len; 984 | char *addr; 985 | for (size_t i = 0; i < cols; i++) { 986 | cs = LazyCSV_ValueFromIndex(i, ridx, comma_memmap, anchor_memmap); 987 | ce = LazyCSV_ValueFromIndex(i + 1, ridx, comma_memmap, 988 | anchor_memmap); 989 | 990 | if (ce - cs == 1) { 991 | PyTuple_SET_ITEM(headers, i, PyBytes_FromString("")); 992 | } 993 | else { 994 | addr = file + cs; 995 | 996 | len = ce - cs - 1; 997 | if (unquote 998 | && addr[0] == *quotechar 999 | && addr[len-1] == *quotechar) { 1000 | addr = addr+1; 1001 | len = 
len-2; 1002 | } 1003 | PyTuple_SET_ITEM( 1004 | headers, i, PyBytes_FromStringAndSize(addr, len) 1005 | ); 1006 | } 1007 | } 1008 | } 1009 | else { 1010 | headers = PyTuple_New(0); 1011 | } 1012 | 1013 | LazyCSV* self = (LazyCSV*)type->tp_alloc(type, 0); 1014 | if (!self) { 1015 | PyErr_SetString( 1016 | PyExc_MemoryError, 1017 | "unable to allocate LazyCSV object" 1018 | ); 1019 | goto unmap_memmaps; 1020 | } 1021 | 1022 | LazyCSV_Cache* _cache = malloc(sizeof(LazyCSV_Cache)); 1023 | _cache->empty = PyBytes_FromString(""); 1024 | _cache->items = malloc((UCHAR_MAX+1)*sizeof(PyObject*)); 1025 | 1026 | for (size_t i = 0; i <= UCHAR_MAX; i++) 1027 | _cache->items[i] = PyBytes_FromFormat("%c", (int)i); 1028 | 1029 | LazyCSV_File* _commas = malloc(sizeof(LazyCSV_File)); 1030 | _commas->name = comma_index; 1031 | _commas->data = comma_memmap; 1032 | _commas->st = comma_st; 1033 | _commas->fd = comma_file; 1034 | 1035 | LazyCSV_File* _anchors = malloc(sizeof(LazyCSV_File)); 1036 | _anchors->name = anchor_index; 1037 | _anchors->data = anchor_memmap; 1038 | _anchors->st = anchor_st; 1039 | _anchors->fd = anchor_file; 1040 | 1041 | LazyCSV_File* _newlines = malloc(sizeof(LazyCSV_File)); 1042 | _newlines->name = newline_index; 1043 | _newlines->data = newline_memmap; 1044 | _newlines->st = newline_st; 1045 | _newlines->fd = newline_file; 1046 | 1047 | LazyCSV_Index* _index = malloc(sizeof(LazyCSV_Index)); 1048 | 1049 | _index->dir = tempdir; 1050 | _index->commas = _commas; 1051 | _index->newlines = _newlines; 1052 | _index->anchors = _anchors; 1053 | 1054 | LazyCSV_File* _data = malloc(sizeof(LazyCSV_File)); 1055 | _data->name = fullname; 1056 | _data->fd = ufd; 1057 | _data->data = file; 1058 | _data->st = ust; 1059 | 1060 | self->rows = rows; 1061 | self->cols = cols; 1062 | self->name = fullname_obj; 1063 | self->headers = headers; 1064 | self->_skip_headers = skip_headers; 1065 | self->_unquote = unquote; 1066 | self->_quotechar = *quotechar; 1067 | self->_newline = newline; 1068 | self->_index = _index; 1069 | self->_data = _data; 1070 | self->_cache = _cache; 1071 | 1072 | return (PyObject*)self; 1073 | 1074 | unmap_memmaps: 1075 | munmap(comma_memmap, comma_st.st_size); 1076 | munmap(anchor_memmap, anchor_st.st_size); 1077 | munmap(newline_memmap, newline_st.st_size); 1078 | Py_DECREF(headers); 1079 | 1080 | close_newline_file: 1081 | close(newline_file); 1082 | 1083 | close_anchor_file: 1084 | close(anchor_file); 1085 | 1086 | close_comma_file: 1087 | close(comma_file); 1088 | munmap(file, ust.st_size); 1089 | Py_XDECREF(tempdir); 1090 | 1091 | close_ufd: 1092 | close(ufd); 1093 | return NULL; 1094 | } 1095 | 1096 | 1097 | static void LazyCSV_Destruct(LazyCSV* self) { 1098 | munmap(self->_data->data, self->_data->st.st_size); 1099 | munmap(self->_index->commas->data, self->_index->commas->st.st_size); 1100 | munmap(self->_index->anchors->data, self->_index->anchors->st.st_size); 1101 | munmap(self->_index->newlines->data, self->_index->newlines->st.st_size); 1102 | 1103 | close(self->_data->fd); 1104 | close(self->_index->commas->fd); 1105 | close(self->_index->anchors->fd); 1106 | close(self->_index->newlines->fd); 1107 | 1108 | remove(self->_index->commas->name); 1109 | remove(self->_index->anchors->name); 1110 | remove(self->_index->newlines->name); 1111 | 1112 | free(self->_index->commas->name); 1113 | free(self->_index->anchors->name); 1114 | free(self->_index->newlines->name); 1115 | 1116 | free(self->_index->commas); 1117 | free(self->_index->anchors); 1118 | 
free(self->_index->newlines); 1119 | 1120 | Py_XDECREF(self->_index->dir); 1121 | 1122 | Py_DECREF(self->_cache->empty); 1123 | for (size_t i = 0; i <= UCHAR_MAX; i++) 1124 | Py_DECREF(self->_cache->items[i]); 1125 | free(self->_cache->items); 1126 | 1127 | free(self->_data); 1128 | free(self->_index); 1129 | free(self->_cache); 1130 | 1131 | Py_DECREF(self->headers); 1132 | Py_DECREF(self->name); 1133 | 1134 | Py_TYPE(self)->tp_free((PyObject*)self); 1135 | } 1136 | 1137 | 1138 | static PyObject *LazyCSV_Seq(PyObject *self, PyObject *args, PyObject *kwargs) { 1139 | 1140 | size_t row = SIZE_MAX; 1141 | size_t col = SIZE_MAX; 1142 | size_t stop; 1143 | char reversed; 1144 | 1145 | static char *kwlist[] = {"row", "col", "reversed", NULL}; 1146 | 1147 | char ok = PyArg_ParseTupleAndKeywords( 1148 | args, kwargs, "|nnb", kwlist, &row, &col, &reversed 1149 | ); 1150 | 1151 | if (!ok) { 1152 | PyErr_SetString( 1153 | PyExc_ValueError, 1154 | "unable to parse lazy.sequence() arguments" 1155 | ); 1156 | return NULL; 1157 | } 1158 | 1159 | if (row == SIZE_MAX && col == SIZE_MAX) { 1160 | PyErr_SetString( 1161 | PyExc_ValueError, 1162 | "a row or a col value is required" 1163 | ); 1164 | return NULL; 1165 | } 1166 | 1167 | if (row != SIZE_MAX && col != SIZE_MAX) { 1168 | PyErr_SetString( 1169 | PyExc_ValueError, 1170 | "cannot specify both row and col" 1171 | ); 1172 | return NULL; 1173 | } 1174 | 1175 | if (col != SIZE_MAX) { 1176 | stop = ((LazyCSV*)self)->rows; 1177 | } 1178 | else if (row != SIZE_MAX) { 1179 | stop = ((LazyCSV*)self)->cols; 1180 | } 1181 | else { 1182 | PyErr_SetString( 1183 | PyExc_RuntimeError, 1184 | "could not determine axis for materialization" 1185 | ); 1186 | return NULL; 1187 | } 1188 | 1189 | PyTypeObject* type = &LazyCSV_IterType; 1190 | LazyCSV_Iter* iter = (LazyCSV_Iter*)type->tp_alloc(type, 0); 1191 | 1192 | if (!iter) { 1193 | PyErr_SetString( 1194 | PyExc_MemoryError, 1195 | "unable to allocate memory for iterable" 1196 | ); 1197 | return NULL; 1198 | } 1199 | 1200 | iter->row = row; 1201 | iter->col = col; 1202 | iter->reversed = reversed; 1203 | iter->position = 0; 1204 | iter->step = 1; 1205 | iter->stop = stop; 1206 | iter->lazy = self; 1207 | 1208 | Py_INCREF(self); 1209 | 1210 | return (PyObject*)iter; 1211 | } 1212 | 1213 | 1214 | static PyObject* LazyCSV_GetValue(PyObject* self, PyObject* r, PyObject* c) { 1215 | 1216 | Py_ssize_t _row = PyLong_AsSsize_t(r); 1217 | Py_ssize_t _col = PyLong_AsSsize_t(c); 1218 | 1219 | LazyCSV* lazy = (LazyCSV*)self; 1220 | 1221 | size_t row = _row < 0 ? lazy->rows + _row : (size_t)_row; 1222 | size_t col = _col < 0 ? 
lazy->cols + _col : (size_t)_col; 1223 | 1224 | int row_in_bounds = ( 1225 | 0 <= row && row < lazy->rows 1226 | ); 1227 | 1228 | int col_in_bounds = ( 1229 | 0 <= col && col < lazy->cols 1230 | ); 1231 | 1232 | if (!row_in_bounds || !col_in_bounds) { 1233 | PyErr_SetString( 1234 | PyExc_ValueError, 1235 | "provided value not in bounds of index" 1236 | ); 1237 | return NULL; 1238 | } 1239 | 1240 | row += !lazy->_skip_headers; 1241 | 1242 | char* newlines = lazy->_index->newlines->data; 1243 | char* anchors = lazy->_index->anchors->data; 1244 | char* commas = lazy->_index->commas->data; 1245 | 1246 | LazyCSV_RowIndex* ridx = 1247 | (LazyCSV_RowIndex*) 1248 | (newlines + row*sizeof(LazyCSV_RowIndex)); 1249 | 1250 | char* aidx = anchors+ridx->index; 1251 | char* cidx = commas+((lazy->cols+1)*row*sizeof(INDEX_DTYPE)); 1252 | 1253 | size_t cs = LazyCSV_ValueFromIndex((size_t)col, ridx, cidx, aidx); 1254 | size_t ce = LazyCSV_ValueFromIndex((size_t)col + 1, ridx, cidx, aidx); 1255 | 1256 | size_t len = ce - cs - 1; 1257 | 1258 | return PyBytes_FromOffsetAndLen(lazy, cs, len); 1259 | } 1260 | 1261 | 1262 | static PyObject* LazyCSV_GetItem(PyObject* self, PyObject* key) { 1263 | if (!PyTuple_Check(key)) { 1264 | PyErr_SetString( 1265 | PyExc_ValueError, 1266 | "index must contain both a row and column value" 1267 | ); 1268 | return NULL; 1269 | } 1270 | 1271 | PyObject *row_obj, *col_obj; 1272 | 1273 | if (!PyArg_ParseTuple(key, "OO", &row_obj, &col_obj)) { 1274 | PyErr_SetString( 1275 | PyExc_RuntimeError, 1276 | "unable to parse index key" 1277 | ); 1278 | return NULL; 1279 | } 1280 | 1281 | if (PyLong_Check(row_obj) && PyLong_Check(col_obj)) 1282 | return LazyCSV_GetValue(self, row_obj, col_obj); 1283 | 1284 | int row_is_slice = PySlice_Check(row_obj); 1285 | int col_is_slice = PySlice_Check(col_obj); 1286 | 1287 | LazyCSV* lazy = (LazyCSV*)self; 1288 | 1289 | if (row_is_slice && !col_is_slice) { 1290 | PySliceObject* row_slice = (PySliceObject*)row_obj; 1291 | 1292 | Py_ssize_t _col = PyLong_AsSsize_t(col_obj); 1293 | size_t col = _col < 0 ? lazy->cols + _col : (size_t)_col; 1294 | 1295 | int col_in_bounds = ( 1296 | 0 <= col && col < lazy->cols 1297 | ); 1298 | 1299 | if (!col_in_bounds) goto boundary_err; 1300 | 1301 | Py_ssize_t _start = row_slice->start == Py_None 1302 | ? (Py_ssize_t)0 1303 | : PyLong_AsSsize_t(row_slice->start); 1304 | Py_ssize_t _stop = row_slice->stop == Py_None 1305 | ? (Py_ssize_t)lazy->rows 1306 | : PyLong_AsSsize_t(row_slice->stop); 1307 | Py_ssize_t _step = 1308 | row_slice->step == Py_None ? 1 : PyLong_AsSsize_t(row_slice->step); 1309 | 1310 | size_t start = _start < 0 ? lazy->rows + _start : (size_t)_start; 1311 | size_t stop = _stop < 0 ? 
lazy->rows + _stop : (size_t)_stop; 1312 | 1313 | size_t step; 1314 | char reversed = 0; 1315 | 1316 | if (_step < 0) { 1317 | reversed = 1; 1318 | step = (size_t)(-1 * _step); 1319 | if (row_slice->start != Py_None) { 1320 | start = lazy->rows - start - 1; 1321 | } 1322 | if (row_slice->stop != Py_None) { 1323 | stop = lazy->rows - stop - 1; 1324 | } 1325 | } 1326 | else { 1327 | step = (size_t)_step; 1328 | } 1329 | 1330 | PyTypeObject* type = &LazyCSV_IterType; 1331 | LazyCSV_Iter* iter = (LazyCSV_Iter*)type->tp_alloc(type, 0); 1332 | if (!iter) goto memory_err; 1333 | 1334 | iter->row = SIZE_MAX; 1335 | iter->col = col; 1336 | iter->reversed = reversed; 1337 | iter->position = start; 1338 | iter->step = step; 1339 | iter->stop = stop; 1340 | iter->lazy = self; 1341 | Py_INCREF(self); 1342 | 1343 | return (PyObject*)iter; 1344 | } 1345 | 1346 | if (col_is_slice && !row_is_slice) { 1347 | PySliceObject* col_slice = (PySliceObject*)col_obj; 1348 | 1349 | Py_ssize_t _row = PyLong_AsSsize_t(row_obj); 1350 | size_t row = _row < 0 ? lazy->rows + _row : (size_t)_row; 1351 | 1352 | int row_in_bounds = ( 1353 | 0 <= row && row < lazy->rows 1354 | ); 1355 | 1356 | if (!row_in_bounds) goto boundary_err; 1357 | 1358 | Py_ssize_t _start = col_slice->start == Py_None 1359 | ? (Py_ssize_t)0 1360 | : PyLong_AsSsize_t(col_slice->start); 1361 | Py_ssize_t _stop = col_slice->stop == Py_None 1362 | ? (Py_ssize_t)lazy->cols 1363 | : PyLong_AsSsize_t(col_slice->stop); 1364 | Py_ssize_t _step = 1365 | col_slice->step == Py_None ? 1 : PyLong_AsSsize_t(col_slice->step); 1366 | 1367 | size_t start = _start < 0 ? lazy->cols + _start : (size_t)_start; 1368 | size_t stop = _stop < 0 ? lazy->cols + _stop : (size_t)_stop; 1369 | 1370 | size_t step; 1371 | char reversed = 0; 1372 | 1373 | if (_step < 0) { 1374 | step = (size_t)(-1 * _step); 1375 | reversed = 1; 1376 | if (col_slice->start != Py_None) { 1377 | start = lazy->cols - start - 1; 1378 | } 1379 | if (col_slice->stop != Py_None) { 1380 | stop = lazy->cols - stop - 1; 1381 | } 1382 | } 1383 | else { 1384 | step = (size_t)_step; 1385 | } 1386 | 1387 | PyTypeObject* type = &LazyCSV_IterType; 1388 | LazyCSV_Iter* iter = (LazyCSV_Iter*)type->tp_alloc(type, 0); 1389 | if (!iter) goto memory_err; 1390 | 1391 | iter->row = row; 1392 | iter->col = SIZE_MAX; 1393 | iter->reversed = reversed; 1394 | iter->position = start; 1395 | iter->step = step; 1396 | iter->stop = stop; 1397 | iter->lazy = self; 1398 | Py_INCREF(self); 1399 | 1400 | return (PyObject*)iter; 1401 | } 1402 | 1403 | goto schema_err; 1404 | 1405 | schema_err: 1406 | PyErr_SetString( 1407 | PyExc_ValueError, 1408 | "given indexing schema is not supported" 1409 | ); 1410 | return NULL; 1411 | 1412 | memory_err: 1413 | PyErr_SetString( 1414 | PyExc_MemoryError, 1415 | "unable to allocate memory for iterable" 1416 | ); 1417 | return NULL; 1418 | 1419 | boundary_err: 1420 | PyErr_SetString( 1421 | PyExc_ValueError, 1422 | "provided value not in bounds of index" 1423 | ); 1424 | return NULL; 1425 | } 1426 | 1427 | 1428 | static PyMemberDef LazyCSV_Members[] = { 1429 | {"headers", T_OBJECT, offsetof(LazyCSV, headers), READONLY, "header tuple"}, 1430 | {"rows", T_LONG, offsetof(LazyCSV, rows), READONLY, "row length"}, 1431 | {"cols", T_LONG, offsetof(LazyCSV, cols), READONLY, "col length"}, 1432 | {"name", T_OBJECT, offsetof(LazyCSV, name), READONLY, "file name"}, 1433 | {NULL, } 1434 | }; 1435 | 1436 | 1437 | static PyMethodDef LazyCSV_Methods[] = { 1438 | { 1439 | "sequence", 1440 | (PyCFunction)LazyCSV_Seq, 
1441 | METH_VARARGS|METH_KEYWORDS, 1442 | "get column iterator" 1443 | }, 1444 | {NULL, } 1445 | }; 1446 | 1447 | static PyMappingMethods LazyCSV_MappingMembers[] = { 1448 | (lenfunc)NULL, 1449 | (binaryfunc)LazyCSV_GetItem, 1450 | (objobjargproc)NULL, 1451 | }; 1452 | 1453 | PyDoc_STRVAR( 1454 | LazyCSV_Docstring, 1455 | "lazycsv.LazyCSV(\n" 1456 | " filepath,\n" 1457 | " /\n" 1458 | " delimiter: str=',',\n" 1459 | " quotechar: str='\"',\n" 1460 | " unquote: bool=True,\n" 1461 | " skip_headers: bool=False,\n" 1462 | " buffer_size: int=2**21,\n" 1463 | " index_dir: str=None,\n" 1464 | ")\n" 1465 | "\n" 1466 | "LazyCSV object constructor. Takes the filepath of a CSV\n" 1467 | "file as the first argument, and several keyword arguments\n" 1468 | "as optional values. Indexes the CSV, generates headers,\n" 1469 | "and returns `self` to the caller." 1470 | "\n\n" 1471 | "Options\n" 1472 | "-------\n" 1473 | "delimiter: ',' -- character used to demarcate the separation\n" 1474 | " of two fields. Should only be a single char.\n" 1475 | "quotechar: '\"' -- character used to ensure contents belong\n" 1476 | " to a single field. Should only be a single char.\n" 1477 | "unquote: bool=True -- if True, a quoted field will be\n" 1478 | " stripped of quotes on parsing. i.e. `,\"goo\\nbar\",`\n" 1479 | " will return 'goo\\nbar'.\n" 1480 | "skip_headers: bool=False -- skips parsing out header\n" 1481 | " values to the .header attribute if True.\n" 1482 | "buffer_size: int=2**21 -- is the buffer size that LazyCSV\n" 1483 | " uses when writing index data to disk during object\n" 1484 | " construction, can be set to any value greater than 0\n" 1485 | " (units of bytes).\n" 1486 | "index_dir: str=None -- Directory where index files\n" 1487 | " are saved. By default uses Python's `TemporaryDirectory()`\n" 1488 | " function in the `tempfile` module.\n" 1489 | "\n" 1490 | "Returns\n" 1491 | "-------\n" 1492 | "self\n" 1493 | 1494 | ); 1495 | 1496 | 1497 | static PyTypeObject LazyCSVType = { 1498 | PyVarObject_HEAD_INIT(NULL, 0) 1499 | .tp_name = "lazycsv.LazyCSV", 1500 | .tp_doc = LazyCSV_Docstring, 1501 | .tp_basicsize = sizeof(LazyCSV), 1502 | .tp_dealloc = (destructor)LazyCSV_Destruct, 1503 | .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, 1504 | .tp_methods = LazyCSV_Methods, 1505 | .tp_members = LazyCSV_Members, 1506 | .tp_as_mapping = LazyCSV_MappingMembers, 1507 | .tp_new = LazyCSV_New, 1508 | }; 1509 | 1510 | 1511 | static PyModuleDef LazyCSVModule = { 1512 | PyModuleDef_HEAD_INIT, 1513 | "lazycsv", 1514 | "module for custom lazycsv object", 1515 | -1, 1516 | NULL 1517 | }; 1518 | 1519 | 1520 | PyMODINIT_FUNC PyInit_lazycsv() { 1521 | #if INCLUDE_NUMPY 1522 | import_array(); 1523 | #endif 1524 | if (PyType_Ready(&LazyCSVType) < 0) 1525 | return NULL; 1526 | 1527 | if (PyType_Ready(&LazyCSV_IterType) < 0) 1528 | return NULL; 1529 | 1530 | PyObject* module = PyModule_Create(&LazyCSVModule); 1531 | if (!module) 1532 | return NULL; 1533 | 1534 | Py_INCREF(&LazyCSVType); 1535 | PyModule_AddObject(module, "LazyCSV", (PyObject*)&LazyCSVType); 1536 | return module; 1537 | } 1538 | 1539 | -------------------------------------------------------------------------------- /tests/benchmark_lazy.py: -------------------------------------------------------------------------------- 1 | import random 2 | import itertools 3 | import os.path 4 | import tempfile 5 | 6 | from time import perf_counter 7 | 8 | 9 | def get_size(file_path, unit="bytes"): 10 | file_size = os.path.getsize(file_path) 11 | exponents_map = {"bytes": 
0, "kb": 1, "mb": 2, "gb": 3} 12 | if unit not in exponents_map: 13 | raise ValueError("Must select from ['bytes', 'kb', 'mb', 'gb']") 14 | else: 15 | size = file_size / 1024 ** exponents_map[unit] 16 | return round(size, 3) 17 | 18 | 19 | def run_lazy(fpath): 20 | from lazycsv import lazycsv 21 | print("indexing lazy... ", end="\r") 22 | ti = perf_counter() 23 | lazy = lazycsv.LazyCSV(fpath) 24 | te = perf_counter() 25 | print(f"indexing lazy... time to index: {te-ti}") 26 | for c in range(lazy.cols): 27 | col = list(lazy.sequence(col=c)) 28 | if c % 100 == 0: 29 | print(f"parsing cols... {c}/{lazy.cols}", end="\r") 30 | del col 31 | tf = perf_counter() 32 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 33 | del lazy 34 | print(f"\ntotal time: {tf-ti}") 35 | 36 | 37 | def run_sqlite(fpath): 38 | import sqlite3, csv 39 | tempdir = tempfile.TemporaryDirectory() 40 | print("creating database... ", end="\r") 41 | ti = perf_counter() 42 | conn = sqlite3.connect(os.path.join(tempdir.name, "data.db")) 43 | cur = conn.cursor() 44 | headers, sql, chunks = [], "", [] 45 | with open(fpath, 'r') as f: 46 | reader = csv.reader(f) 47 | for r in reader: 48 | if not headers: 49 | headers = ', '.join(r) 50 | cur.execute(f"CREATE TABLE t ({headers});") 51 | conn.commit() 52 | sql = f"INSERT INTO t ({headers}) VALUES ({', '.join('?'*len(r))});" 53 | headers = r 54 | chunks.append(r) 55 | if len(chunks) > 10000: 56 | cur.executemany(sql, chunks) 57 | conn.commit() 58 | chunks.clear() 59 | if chunks: 60 | cur.executemany(sql, chunks) 61 | conn.commit() 62 | te = perf_counter() 63 | print(f"creating database... time to db: {te-ti}") 64 | for i, c in enumerate(headers): 65 | sql = f"SELECT ({c}) from t;" 66 | # col = tuple(cur.execute(sql)) 67 | col = list(itertools.chain(*cur.execute(sql))) 68 | if i % 100 == 0: 69 | print(f"parsing cols... {i}/{len(headers)}", end="\r") 70 | del col 71 | tf = perf_counter() 72 | print(f"parsing cols... time to parse: {tf-te}") 73 | cur.close() 74 | conn.close() 75 | print(f"\ntotal time: {tf-ti}") 76 | 77 | 78 | def run_datatable(fpath): 79 | import datatable as dt 80 | print("creating datatables frame...", end="\r") 81 | ti = perf_counter() 82 | frame = dt.fread(fpath) 83 | te = perf_counter() 84 | print(f"creating datatables frame... time to object: {te-ti}") 85 | for c in range(frame.ncols): 86 | col = frame[c].to_list() 87 | if c % 100 == 0: 88 | print(f"parsing cols... {c}/{frame.ncols}", end="\r") 89 | del col 90 | tf = perf_counter() 91 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 92 | del frame 93 | print(f"\ntotal time: {tf-ti}") 94 | 95 | 96 | def run_pandas(fpath): 97 | import pandas as pd 98 | print("creating pandas dataframe...", end="\r") 99 | ti = perf_counter() 100 | df = pd.read_csv(fpath) 101 | te = perf_counter() 102 | print(f"creating pandas dataframe... time to object: {te-ti}") 103 | for i, c in enumerate(df.columns): 104 | col = df[c] 105 | if i % 100 == 0: 106 | print(f"parsing col: {c}", end="\r") 107 | del col 108 | te = perf_counter() 109 | del df 110 | print(f"\ntotal time: {te-ti}") 111 | 112 | 113 | def run_pyarrow(fpath): 114 | from pyarrow import csv as pa_csv 115 | print("creating pyarrow table...", end="\r") 116 | ti = perf_counter() 117 | table = pa_csv.read_csv(fpath) 118 | te = perf_counter() 119 | print(f"creating pyarrow table... time to object: {te-ti}") 120 | for c in range(table.num_columns): 121 | col = table[c].to_pylist() 122 | if c % 100 == 0: 123 | print(f"parsing cols... 
{c}/{table.num_columns}", end="\r") 124 | del col 125 | tf = perf_counter() 126 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 127 | del table 128 | print(f"\ntotal time: {tf-ti}") 129 | 130 | 131 | def run_polars_read(fpath): 132 | import polars as pl 133 | print("creating polars df...", end="\r") 134 | ti = perf_counter() 135 | table = pl.read_csv(fpath) 136 | te = perf_counter() 137 | print(f"creating polars df... time to object: {te-ti}") 138 | for c in range(table.shape[1]): 139 | col = table[:, c].to_list() 140 | if c % 100 == 0: 141 | print(f"parsing cols... {c}/{table.shape[1]}", end="\r") 142 | del col 143 | tf = perf_counter() 144 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 145 | del table 146 | print(f"\ntotal time: {tf-ti}") 147 | 148 | 149 | def run_polars_scan(fpath): 150 | import polars as pl 151 | print("creating polars df...", end="\r") 152 | ti = perf_counter() 153 | table = pl.scan_csv(fpath, rechunk=False) 154 | te = perf_counter() 155 | print(f"creating polars df... time to object: {te-ti}") 156 | cols = len(table.columns) 157 | for i, c in enumerate(table.columns): 158 | col = tuple( 159 | table 160 | .select(c) 161 | .collect() 162 | .to_dict(as_series=False) 163 | .values() 164 | ) 165 | if i % 100 == 0: 166 | print(f"parsing cols... {i}/{cols}", end="\r") 167 | del col 168 | tf = perf_counter() 169 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 170 | del table 171 | print(f"\ntotal time: {tf-ti}") 172 | 173 | 174 | def main(): 175 | cols = 5000 176 | rows = 50000 177 | sparsity = 0.50 178 | benchmarks = { 179 | "lazycsv": run_lazy, 180 | # "pandas": run_pandas, 181 | # "pyarrow": run_pyarrow, 182 | # "datatable": run_datatable, 183 | # "polars (read)": run_polars_read, 184 | # "polars (scan)": run_polars_scan, 185 | # "sqlite": run_sqlite 186 | } 187 | filename = f"benchmark_{rows}r_{cols}c_{int(sparsity*100)}%.csv" 188 | HERE = os.path.abspath(os.path.dirname(__file__)) 189 | dir = os.path.join(HERE, f"fixtures/benchmarks") 190 | os.makedirs(dir, exist_ok=True) 191 | filepath = os.path.join(dir, filename) 192 | if os.path.isfile(filepath): 193 | name = filepath 194 | tempf = None 195 | else: 196 | tempf = tempfile.NamedTemporaryFile() 197 | headers = ",".join(f"col_{i}" for i in range(cols)) + "\n" 198 | tempf.write(headers.encode("utf8")) 199 | i = 0 200 | for i in range(rows): 201 | row = ",".join( 202 | f"{i}x{j}" if random.random() > sparsity else "" for j in range(cols) 203 | ) 204 | tempf.write((row + "\n").encode("utf8")) 205 | del row 206 | if i % 100 == 0: 207 | print(f"writing rows: {i}/{rows}", end="\r") 208 | print(f"writing rows: {i}/{rows}") 209 | tempf.flush() 210 | name = tempf.name 211 | if input("copy to benchmarks? 
[Y/n]: ") in {"Y", ""}: 212 | __import__("shutil").copyfile(tempf.name, filepath) 213 | path = os.path.abspath(name) 214 | print(f"filesize: {get_size(name, 'gb')}gb") 215 | print(f"{cols=}") 216 | print(f"{rows=}") 217 | print(f"{sparsity=}") 218 | for name, fn in benchmarks.items(): 219 | print(f"\nbenchmarking {name}:") 220 | fn(path) 221 | if tempf: 222 | tempf.close() 223 | 224 | 225 | if __name__ == "__main__": 226 | main() 227 | 228 | -------------------------------------------------------------------------------- /tests/fixtures/file.csv: -------------------------------------------------------------------------------- 1 | ,ALPHA,BETA 2 | 0,a0,b0 3 | 1,a1,b1 4 | -------------------------------------------------------------------------------- /tests/fixtures/file_crlf.csv: -------------------------------------------------------------------------------- 1 | ,A,B 2 | 0,a0,b0 3 | -------------------------------------------------------------------------------- /tests/fixtures/file_crlf2.csv: -------------------------------------------------------------------------------- 1 | ,"This,that","Fizz,Buzz" 2 | 0,"Goo,Bar","Bizz,Bazz" -------------------------------------------------------------------------------- /tests/fixtures/file_delimiter_and_quotechar.csv: -------------------------------------------------------------------------------- 1 | INDEX ATTR 2 | 0 |A| 3 | 1 |B| 4 | -------------------------------------------------------------------------------- /tests/fixtures/file_empty.csv: -------------------------------------------------------------------------------- 1 | ,ALPHA,BETA 2 | ,, 3 | ,, 4 | -------------------------------------------------------------------------------- /tests/fixtures/file_newline.csv: -------------------------------------------------------------------------------- 1 | ,"This,that 2 | ","Fizz,Buzz " 3 | 0,"Goo,Bar 4 | ","Bizz,Bazz" 5 | -------------------------------------------------------------------------------- /tests/script_lazycsv.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from lazycsv import lazycsv 4 | 5 | file = sys.argv[1] 6 | 7 | lazy = lazycsv.LazyCSV(file) 8 | 9 | data = [ 10 | list(lazy[:, i]) 11 | for i in range(lazy.cols) 12 | ] 13 | 14 | -------------------------------------------------------------------------------- /tests/test_lazycsv.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import csv 3 | import os 4 | import os.path 5 | import tempfile 6 | import textwrap 7 | 8 | from lazycsv import lazycsv 9 | 10 | import numpy as np 11 | import pytest 12 | 13 | HERE = os.path.abspath(os.path.dirname(__file__)) 14 | FPATH = os.path.join(HERE, "fixtures/file.csv") 15 | 16 | INDEX_COLLECTION = [None, *range(-9, 0), *range(1, 10)] 17 | 18 | SLICE_INDEXES = [ 19 | (a, b, c) 20 | for a in INDEX_COLLECTION 21 | for b in INDEX_COLLECTION 22 | for c in INDEX_COLLECTION 23 | ] 24 | 25 | 26 | @pytest.fixture 27 | def lazy(): 28 | lazy = lazycsv.LazyCSV(FPATH) 29 | yield lazy 30 | 31 | 32 | @pytest.fixture 33 | def file_1000r_1000c(): 34 | tempf = tempfile.NamedTemporaryFile() 35 | cols, rows = 1000, 1000 36 | headers = ",".join("col_{i}".format_map(dict(i=i)) for i in range(cols)) + "\n" 37 | tempf.write(headers.encode("utf8")) 38 | for _ in range(rows): 39 | row = ",".join("{j}".format_map(dict(j=j)) for j in range(cols)) + "\n" 40 | tempf.write(row.encode("utf8")) 41 | tempf.flush() 42 | yield tempf 43 | tempf.close() 44 | 45 | 46 | 
@contextlib.contextmanager 47 | def prepped_file(actual): 48 | tempf = tempfile.NamedTemporaryFile() 49 | tempf.write(actual) 50 | tempf.flush() 51 | yield tempf 52 | tempf.close() 53 | 54 | 55 | def test_demo(): 56 | actual = b"INDEX,A,B\n0,,2\n,,5" 57 | with prepped_file(actual) as tempf: 58 | lazy = lazycsv.LazyCSV(tempf.name) 59 | data = tuple(tuple(lazy.sequence(col=c)) for c in range(lazy.cols)) 60 | assert data == ((b"0", b""), (b"", b""), (b"2", b"5")) 61 | 62 | 63 | class TestLazyCSV: 64 | def test_attributes(self): 65 | lazy = lazycsv.LazyCSV(b"../tests/fixtures/file.csv") 66 | assert lazy.name == os.path.abspath(FPATH).encode() 67 | assert lazy.headers == (b"", b"ALPHA", b"BETA") 68 | 69 | def test_bad_file_arg(self): 70 | with pytest.raises(ValueError) as err: 71 | _ = lazycsv.LazyCSV(1) 72 | (_str,) = err.value.args 73 | assert _str == "first argument must be str or bytes" 74 | 75 | def test_more_headers(self): 76 | actual = b"INDEX,,AA,B,CC,D,EE\n0,1,2,3,4,5,6\n" 77 | with prepped_file(actual) as tempf: 78 | lazy = lazycsv.LazyCSV(tempf.name) 79 | assert lazy.headers == (b"INDEX", b"", b"AA", b"B", b"CC", b"D", b"EE") 80 | 81 | def test_headers_empty_index(self): 82 | actual = b",AA,B,CC,D,EE\n0,1,2,3,4,\n" 83 | with prepped_file(actual) as tempf: 84 | lazy = lazycsv.LazyCSV(tempf.name) 85 | assert lazy.headers == (b"", b"AA", b"B", b"CC", b"D", b"EE") 86 | 87 | def test_initial_parse(self, lazy): 88 | assert lazy.rows, lazy.cols == (2, 3) 89 | 90 | def test_initial_parse_skip_headers(self): 91 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 92 | assert lazy.rows, lazy.cols == (3, 3) 93 | assert lazy.headers == () 94 | 95 | def test_get_column(self, lazy): 96 | actual = list(lazy.sequence(col=0)) 97 | assert actual == [b"0", b"1"] 98 | actual = list(lazy.sequence(col=1)) 99 | assert actual == [b"a0", b"a1"] 100 | actual = list(lazy.sequence(col=2)) 101 | assert actual == [b"b0", b"b1"] 102 | 103 | def test_get_column_slice(self, lazy): 104 | actual = list(lazy[:, 1]) 105 | assert actual == [b"a0", b"a1"] 106 | with pytest.raises(ValueError) as err: 107 | _ = list(lazy[:, -5]) 108 | assert err.value.args == ("provided value not in bounds of index",) 109 | 110 | def test_get_col_slice_variety(self, lazy): 111 | actual = b"INDEX\n0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n" 112 | with prepped_file(actual) as tempf: 113 | lazy = lazycsv.LazyCSV(tempf.name) 114 | for indexes in SLICE_INDEXES: 115 | _slice = slice(*indexes) 116 | expected = list(range(10))[_slice] 117 | actual = list(map(int, lazy[_slice, 0])) 118 | assert actual == expected 119 | 120 | def test_get_actual_col(self): 121 | actual = b"INDEX,ATTR\n0,a\n1,b\n2,c\n3,d\n" 122 | with prepped_file(actual) as tempf: 123 | lazy = lazycsv.LazyCSV(tempf.name) 124 | assert list(lazy.sequence(col=0)) == [b"0", b"1", b"2", b"3"] 125 | assert list(lazy.sequence(col=1)) == [b"a", b"b", b"c", b"d"] 126 | assert lazy.headers == (b"INDEX", b"ATTR") 127 | assert lazy.rows, lazy.cols == (4, 2) 128 | 129 | def test_get_actual_col_skip_headers(self): 130 | actual = b"INDEX,ATTR\n0,a\n1,b\n2,c\n3,d\n" 131 | with prepped_file(actual) as tempf: 132 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 133 | assert list(lazy.sequence(col=0)) == [b"INDEX", b"0", b"1", b"2", b"3"] 134 | assert list(lazy.sequence(col=1)) == [b"ATTR", b"a", b"b", b"c", b"d"] 135 | assert lazy.headers == () 136 | assert lazy.rows, lazy.cols == (4, 2) 137 | 138 | def test_headless_actual_col(self): 139 | actual = b"INDEX,ATTR\n0,a\n1,b\n" 140 | with prepped_file(actual) 
as tempf: 141 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 142 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 143 | 144 | assert lazy.rows, lazy.cols == (3, 2) 145 | assert actual == [[b"INDEX", b"0", b"1"], [b"ATTR", b"a", b"b"]] 146 | assert lazy.headers == () 147 | 148 | def test_get_row(self, lazy): 149 | row_0 = list(lazy.sequence(row=0)) 150 | assert row_0 == [b"0", b"a0", b"b0"] 151 | row_1 = list(lazy.sequence(row=1)) 152 | assert row_1 == [b"1", b"a1", b"b1"] 153 | 154 | def test_get_row_getitem(self, lazy): 155 | row_0 = list(lazy[0, :]) 156 | assert row_0 == [b"0", b"a0", b"b0"] 157 | with pytest.raises(ValueError) as err: 158 | _ = list(lazy[-5, :]) 159 | assert err.value.args == ("provided value not in bounds of index",) 160 | 161 | def test_get_row_slice_variety(self): 162 | actual = b"A,B,C,D,E,F,G,H,I,J\n0,1,2,3,4,5,6,7,8,9\n" 163 | with prepped_file(actual) as tempf: 164 | lazy = lazycsv.LazyCSV(tempf.name) 165 | for indexes in SLICE_INDEXES: 166 | _slice = slice(*indexes) 167 | expected = list(range(10))[_slice] 168 | actual = list(map(int, lazy[0, _slice])) 169 | assert actual == expected 170 | 171 | def test_get_row_slice_skipped_headers(self): 172 | actual = b"A,B,C,D,E,F,G,H,I,J\n0,1,2,3,4,5,6,7,8,9\n" 173 | with prepped_file(actual) as tempf: 174 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 175 | for indexes in SLICE_INDEXES: 176 | _slice = slice(*indexes) 177 | expected = list(range(10))[_slice] 178 | actual = list(map(int, lazy[1, _slice])) 179 | assert actual == expected 180 | 181 | def test_empty_csv(self): 182 | lazy = lazycsv.LazyCSV("fixtures/file_empty.csv") 183 | actual = [list(lazy.sequence(col=i)) for i in range(lazy.cols)] 184 | assert actual == [[b"", b""], [b"", b""], [b"", b""]] 185 | 186 | def test_headless_empty_csv(self): 187 | actual = b",\n,\n,\n" 188 | with prepped_file(actual) as tempf: 189 | lazy = lazycsv.LazyCSV(tempf.name) 190 | col1 = list(lazy.sequence(col=1)) 191 | col0 = list(lazy.sequence(col=0)) 192 | 193 | actual = [col0, col1] 194 | assert actual == [[b"", b""], [b"", b""]] 195 | assert lazy.rows, lazy.cols == (2, 3) 196 | 197 | def test_empty_skipped_headers_csv(self): 198 | actual = b",\n,\n,\n" 199 | with prepped_file(actual) as tempf: 200 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 201 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 202 | assert lazy.rows, lazy.cols == (3, 2) 203 | assert actual == [[b"", b"", b""], [b"", b"", b""]] 204 | assert lazy.headers == () 205 | 206 | def test_getitem(self, lazy): 207 | data = b",,\n0x0,0x1,0x2\n1x0,1x1,1x2\n2x0,2x1,2x2\n" 208 | with prepped_file(data) as tempf: 209 | lazy = lazycsv.LazyCSV(tempf.name) 210 | assert lazy[0, 0] == lazy[-3, -3] == b"0x0" 211 | assert lazy[1, 1] == lazy[-2, -2] == b"1x1" 212 | assert lazy[2, 2] == lazy[-1, -1] == b"2x2" 213 | with pytest.raises(ValueError) as err: 214 | lazy[3, 3] 215 | assert ("provided value not in bounds of index",) == err.value.args 216 | 217 | def test_getitem_empty(self, lazy): 218 | data = b",,\n0x0,0x1,0x2\n1x0,,1x2\n2x0,2x1,2x2\n" 219 | with prepped_file(data) as tempf: 220 | lazy = lazycsv.LazyCSV(tempf.name) 221 | assert lazy[1, 1] == b"" 222 | 223 | def test_getitem_skipped_headers(self): 224 | data = b"0x0,0x1,0x2\n1x0,1x1,1x2\n2x0,2x1,2x2\n" 225 | with prepped_file(data) as tempf: 226 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 227 | assert lazy[0, 0] == lazy[-3, -3] == b"0x0" 228 | assert lazy[1, 1] == lazy[-2, -2] == b"1x1" 229 | assert 
lazy[2, 2] == lazy[-1, -1] == b"2x2" 230 | with pytest.raises(ValueError) as err: 231 | lazy[3, 3] 232 | assert ("provided value not in bounds of index",) == err.value.args 233 | 234 | 235 | class TestLazyCSVIter: 236 | def test_to_list(self, lazy): 237 | assert lazy.sequence(col=0).to_list() == [b"0", b"1"] 238 | assert lazy.sequence(row=1).to_list() == [b'1', b'a1', b'b1'] 239 | 240 | def test_to_numpy(self): 241 | actual = b"INDEX,ATTR\n0,a\n10,b\n100,c\n1000,d\n" 242 | with prepped_file(actual) as tempf: 243 | lazy = lazycsv.LazyCSV(tempf.name) 244 | _iter = lazy.sequence(col=0) 245 | if hasattr(_iter, "to_numpy"): 246 | arr = _iter.to_numpy() 247 | assert arr.tolist() == [b"0", b"10", b"100", b"1000"] 248 | assert (lazy 249 | .sequence(row=1) 250 | .to_numpy() 251 | .tolist() 252 | ) == [b'10', b'b'] 253 | else: 254 | raise RuntimeError( 255 | "test suite did not test numpy, recompile while setting" 256 | " LAZYCSV_INCLUDE_NUMPY=1 as an env variable to compile" 257 | " extension with numpy support." 258 | ) 259 | 260 | 261 | class TestLazyCSVOptions: 262 | def test_custom_quotechar_and_delimiter(self): 263 | lazy = lazycsv.LazyCSV("fixtures/file_delimiter_and_quotechar.csv", quotechar="|", delimiter="\t") 264 | actual = list(list(lazy.sequence(row=i)) for i in range(lazy.rows)) 265 | expected = [[b'0', b'A'], [b'1', b'B']] 266 | assert actual == expected 267 | 268 | def test_custom_quotechar_unquote_false(self): 269 | data = "INDEX,ATTR\n0,|A|\n1,|B|\n" 270 | with prepped_file(data.encode()) as tempf: 271 | lazy = lazycsv.LazyCSV(tempf.name, unquote=False, quotechar="|") 272 | actual = list(list(lazy.sequence(row=i)) for i in range(lazy.rows)) 273 | expected = [[b'0', b'|A|'], [b'1', b'|B|']] 274 | assert actual == expected 275 | 276 | def test_get_skipped_header_column(self): 277 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 278 | actual = list(lazy.sequence(col=0)) 279 | assert actual == [b"", b"0", b"1"] 280 | actual = list(lazy.sequence(col=1)) 281 | assert actual == [b"ALPHA", b"a0", b"a1"] 282 | actual = list(lazy.sequence(col=2)) 283 | assert actual == [b"BETA", b"b0", b"b1"] 284 | 285 | def test_get_skip_headers_row(self): 286 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 287 | row_0 = list(lazy.sequence(row=0)) 288 | assert row_0 == [b"", b"ALPHA", b"BETA"] 289 | row_1 = list(lazy.sequence(row=1)) 290 | assert row_1 == [b"0", b"a0", b"b0"] 291 | row_2 = list(lazy.sequence(row=2)) 292 | assert row_2 == [b"1", b"a1", b"b1"] 293 | 294 | def test_skipped_headers_full_row(self): 295 | actual = b"this,that\n,\n,\n" 296 | with prepped_file(actual) as tempf: 297 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 298 | actual = list(list(lazy.sequence(row=i)) for i in range(lazy.rows)) 299 | assert lazy.headers == () 300 | header, *rest = actual 301 | assert header == [b"this", b"that"] 302 | assert rest == [[b"", b""], [b"", b""]] 303 | 304 | def test_get_skip_headers_row_reversed(self): 305 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 306 | row_0 = list(lazy.sequence(row=0, reversed=True)) 307 | assert row_0 == [b"BETA", b"ALPHA", b""] 308 | row_1 = list(lazy.sequence(row=1, reversed=True)) 309 | assert row_1 == [b"b0", b"a0", b"0"] 310 | row_2 = list(lazy.sequence(row=2, reversed=True)) 311 | assert row_2 == [b"b1", b"a1", b"1"] 312 | 313 | def test_get_reversed_column(self, lazy): 314 | actual = list(lazy.sequence(col=0, reversed=True)) 315 | assert actual == [b"1", b"0"] 316 | actual = list(lazy.sequence(col=1, reversed=True)) 317 | assert actual == [b"a1", 
b"a0"] 318 | actual = list(lazy.sequence(col=2, reversed=True)) 319 | assert actual == [b"b1", b"b0"] 320 | 321 | def test_get_skipped_header_column_reversed(self): 322 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 323 | actual = [list(lazy.sequence(col=i, reversed=True)) for i in range(lazy.cols)] 324 | expected = [ 325 | [b"1", b"0", b""], 326 | [b"a1", b"a0", b"ALPHA"], 327 | [b"b1", b"b0", b"BETA"], 328 | ] 329 | assert actual == expected 330 | 331 | def test_get_reversed_row(self, lazy): 332 | row_0 = list(lazy.sequence(row=0, reversed=True)) 333 | assert row_0 == [b"b0", b"a0", b"0"] 334 | row_1 = list(lazy.sequence(row=1, reversed=True)) 335 | assert row_1 == [b"b1", b"a1", b"1"] 336 | 337 | def test_newlines_in_quote(self): 338 | lazy = lazycsv.LazyCSV("fixtures/file_newline.csv", unquote=False) 339 | assert lazy.headers == (b"", b'"This,that\n"', b'"Fizz,Buzz\r"') 340 | actual = [list(lazy.sequence(col=i)) for i in range(lazy.cols)] 341 | assert actual == [[b"0"], [b'"Goo,Bar\n"'], [b'"Bizz,Bazz"']] 342 | 343 | def test_buffer_size(self): 344 | lazy = lazycsv.LazyCSV(FPATH, buffer_size=1024) 345 | actual = list(lazy.sequence(col=0)) 346 | assert actual == [b"0", b"1"] 347 | 348 | def test_negative_buffer_size(self): 349 | with pytest.raises(ValueError) as e: 350 | lazycsv.LazyCSV(FPATH, buffer_size=-1) 351 | assert e.type == ValueError 352 | 353 | def test_dirname(self): 354 | tempdir = tempfile.TemporaryDirectory() 355 | _ = lazycsv.LazyCSV(FPATH, index_dir=tempdir.name) 356 | assert len(os.listdir(tempdir.name)) == 3 357 | 358 | 359 | class TestCRLF: 360 | def test_crlf1(self): 361 | lazy = lazycsv.LazyCSV("fixtures/file_crlf.csv") 362 | 363 | assert lazy.headers == (b"", b"A", b"B") 364 | actual = list(lazy.sequence(col=0)) 365 | assert actual == [b"0"] 366 | actual = list(lazy.sequence(col=1)) 367 | assert actual == [b"a0"] 368 | actual = list(lazy.sequence(col=2)) 369 | assert actual == [b"b0"] 370 | 371 | def test_crlf2(self): 372 | lazy = lazycsv.LazyCSV("fixtures/file_crlf2.csv", unquote=False) 373 | 374 | assert lazy.headers == (b"", b'"This,that"', b'"Fizz,Buzz"') 375 | actual = list(lazy.sequence(col=0)) 376 | assert actual == [b"0"] 377 | actual = list(lazy.sequence(col=1)) 378 | assert actual == [b'"Goo,Bar"'] 379 | actual = list(lazy.sequence(col=2)) 380 | assert actual == [b'"Bizz,Bazz"'] 381 | 382 | 383 | class TestBigFiles: 384 | def test_bigger_file(self, file_1000r_1000c): 385 | lazy = lazycsv.LazyCSV(file_1000r_1000c.name) 386 | actual = list(lazy.sequence(col=0)) 387 | assert len(actual) == 1000 388 | 389 | def test_variable_buffer_size(self, file_1000r_1000c): 390 | lazy = lazycsv.LazyCSV(file_1000r_1000c.name, buffer_size=10**7) 391 | actual = list(lazy.sequence(col=0)) 392 | assert len(actual) == 1000 393 | 394 | def test_big_sparse(self): 395 | tempf = tempfile.NamedTemporaryFile() 396 | cols, rows = 200, 200 397 | headers = ",".join("col_{i}".format_map(dict(i=i)) for i in range(cols)) + "\n" 398 | tempf.write(headers.encode("utf8")) 399 | targets = {249, 499, 749, 999} 400 | for _ in range(rows): 401 | row = ",".join("{j}".format_map(dict(j=j)) if j in targets else "" for j in range(cols)) + "\n" 402 | tempf.write(row.encode("utf8")) 403 | tempf.flush() 404 | 405 | lazy = lazycsv.LazyCSV(tempf.name) 406 | with open(tempf.name) as f: 407 | reader = csv.reader(f) 408 | headers = tuple(x.encode() for x in next(reader)) 409 | data = list(reader) 410 | assert headers == lazy.headers 411 | for val in range(cols): 412 | expected = [i[val].encode() for i in 
data] 413 | actual = list(lazy.sequence(col=val)) 414 | assert actual == expected 415 | 416 | 417 | class TestUnorderedFiles: 418 | def test_missing_col(self): 419 | data = "x,y,z\r\n1,2\r\n3,1,3\r\n".encode() 420 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 421 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 422 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 423 | expected = [[b"x", b"1", b"3"], [b"y", b"2", b"1"], [b"z", b"", b"3"]] 424 | assert actual == expected 425 | 426 | def test_many_missing_cols(self): 427 | data = "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z\n\n0,,,,,,,,,,,,,,,,,,,,,,,,,1\n".encode() 428 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 429 | lazy = lazycsv.LazyCSV(tempf.name) 430 | assert list(lazy.sequence(col=0)) == [b"", b"0"] 431 | assert list(lazy.sequence(col=25)) == [b"", b"1"] 432 | 433 | def test_extra_col(self): 434 | data = "x,y\r\n1,2,3\r\n4,5\r\n".encode() 435 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 436 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 437 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 438 | expected = [[b"x", b"1", b"4"], [b"y", b"2", b"5"]] 439 | assert actual == expected 440 | 441 | def test_many_extra_col(self): 442 | data = "x\r\n,,,,,,,,,,,,,,,,,,,,,,,,,,,\r\n4\r\n".encode() 443 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 444 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 445 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 446 | expected = [[b"x", b"", b"4"]] 447 | assert actual == expected 448 | 449 | 450 | class TestEdgecases: 451 | def test_many_files_separators(self): 452 | for sep in ("\n", "\r", "\r\n"): 453 | for i in range(250, 265): 454 | header = "A" * i 455 | data = "{header}{sep}1{sep}2".format_map(dict(header=header, sep=sep)) 456 | with prepped_file(data.encode()) as tempf: 457 | lazy = lazycsv.LazyCSV(tempf.name) 458 | actual = list(lazy.sequence(col=0)) 459 | headers = lazy.headers 460 | assert actual == [b"1", b"2"] 461 | assert headers == (header.encode(),) 462 | 463 | def test_many_empty_files_separators(self): 464 | for sep in ("\n", "\r", "\r\n"): 465 | for i in range(250, 261): 466 | header = "A" * i 467 | data = "{header}{sep}{sep}".format_map(dict(header=header, sep=sep)) 468 | with prepped_file(data.encode()) as tempf: 469 | lazy = lazycsv.LazyCSV(tempf.name) 470 | actual = list(lazy.sequence(col=0)) 471 | headers = lazy.headers 472 | assert actual == [b""] 473 | assert headers == (header.encode(),) 474 | 475 | def test_many_empty_files_separators_many_cols(self): 476 | for sep in ("\n", "\r", "\r\n"): 477 | for item in ("", "0"): 478 | for n in range(250, 261): 479 | header = ",".join(item for _ in range(n)) 480 | data = "" 481 | for _ in range(n): 482 | data += header + sep 483 | with prepped_file(data.encode()) as tempf: 484 | lazy = lazycsv.LazyCSV(tempf.name) 485 | actual = list(lazy.sequence(col=0)) 486 | headers = lazy.headers 487 | assert len(actual) == len(headers) - 1 == n - 1 488 | assert all(i == item.encode() for i in actual) 489 | 490 | def test_sparse_column(self): 491 | data = "HEADER\n\n1\n\n2\n\n\n3\n" 492 | with prepped_file(data.encode()) as tempf: 493 | lazy = lazycsv.LazyCSV(tempf.name) 494 | actual = list(lazy.sequence(col=0)) 495 | headers = lazy.headers 496 | assert headers == (b"HEADER",) 497 | assert actual == [b"", b"1", b"", b"2", b"", b"", b"3"] 498 | 499 | def test_sparse_crlf_column(self): 
500 | data = "HEADER\r\n\r\n1\r\n\r\n2\r\n\r\n\r\n3\r\n" 501 | with prepped_file(data.encode()) as tempf: 502 | lazy = lazycsv.LazyCSV(tempf.name) 503 | actual = list(lazy.sequence(col=0)) 504 | headers = lazy.headers 505 | assert headers == (b"HEADER",) 506 | assert actual == [b"", b"1", b"", b"2", b"", b"", b"3"] 507 | 508 | def test_getitem_with_crlf_newline_at_eof(self): 509 | data = 'x,y,z,str,date,quarter,ca_subvar_1,ca_subvar_2,ca_subvar_3,bool1,bool2,bool3\r\n1,2000-01-01T00:00:00,"range(-999.0,0.0)",red,2014-11-01T00:00:00,2014-10-01T00:00:00,1,1,2,1,1,0\r\n2,2000-01-02T00:00:00,No Data,green,2014-11-01T00:00:00,2014-10-01T00:00:00,1,2,3,1,0,0\r\n3,1950-12-24T00:00:00,1.234,reg-green-blue-whatever,2014-12-15T00:00:00,2014-10-01T00:00:00,2,3,4,0,1,0\r\n'.encode() 510 | with prepped_file(data) as tempf: 511 | lazy = lazycsv.LazyCSV(tempf.name) 512 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 513 | expected = [ 514 | [b"1", b"2", b"3"], 515 | [b"2000-01-01T00:00:00", b"2000-01-02T00:00:00", b"1950-12-24T00:00:00"], 516 | [b"range(-999.0,0.0)", b"No Data", b"1.234"], 517 | [b"red", b"green", b"reg-green-blue-whatever"], 518 | [b"2014-11-01T00:00:00", b"2014-11-01T00:00:00", b"2014-12-15T00:00:00"], 519 | [b"2014-10-01T00:00:00", b"2014-10-01T00:00:00", b"2014-10-01T00:00:00"], 520 | [b"1", b"1", b"2"], 521 | [b"1", b"2", b"3"], 522 | [b"2", b"3", b"4"], 523 | [b"1", b"1", b"0"], 524 | [b"1", b"0", b"1"], 525 | [b"0", b"0", b"0"], 526 | ] 527 | assert expected == actual 528 | 529 | def test_problematic_numeric(self): 530 | data = """\ 531 | ColName 532 | -9 533 | 0 534 | -179769313486000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 535 | 1.234 536 | 999 537 | 3.14159 538 | -179769313486000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 539 | """ 540 | data = textwrap.dedent(data) 541 | data = data.encode() 542 | with prepped_file(data) as tempf: 543 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 544 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 545 | expected = [data.split()] 546 | assert actual == expected 547 | 548 | def test_encoded_headers(self): 549 | data = '"Göteborg","Sverige",Umeå,Köln,東京,deltaΔdelta\nc1,c2,c3,c4,c5,c6\n' 550 | data = data.encode() 551 | with prepped_file(data) as tempf: 552 | lazy = lazycsv.LazyCSV(tempf.name) 553 | assert lazy.headers == ( 554 | "Göteborg".encode(), 555 | "Sverige".encode(), 556 | "Umeå".encode(), 557 | "Köln".encode(), 558 | "東京".encode(), 559 | "delta\u0394delta".encode(), 560 | ) 561 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 562 | expected = [[b"c1"], [b"c2"], [b"c3"], [b"c4"], [b"c5"], [b"c6"]] 563 | assert actual == expected 564 | 565 | def test_crlf_no_newline(self): 566 | actual = "header1,header2,header3\r\n1,2,3\r\n4,5,6\r\n7,8,9".encode() 567 | with prepped_file(actual) as tempf: 568 | lazy = lazycsv.LazyCSV(tempf.name) 569 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 570 | expected = [[b"1", b"4", b"7"], [b"2", b"5", b"8"], [b"3", 
b"6", b"9"]] 571 | assert actual == expected 572 | 573 | def test_empty_headers(self): 574 | data = ( 575 | '" ","","repeated"\n' 576 | '2557," Bagua "," Amazonas"\n' 577 | '2563," Bongara "," Amazonas"\n' 578 | '2535," Chachapoyas "," Amazonas"\n' 579 | '2576," Condorcanqui "," Amazonas"\n' 580 | ) 581 | data = textwrap.dedent(data) 582 | data = data.encode() 583 | with prepped_file(data) as tempf: 584 | lazy = lazycsv.LazyCSV(tempf.name) 585 | assert lazy.headers == (b" ", b"", b"repeated") 586 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 587 | expected = [ 588 | [b"2557", b"2563", b"2535", b"2576"], 589 | [b" Bagua ", b" Bongara ", b" Chachapoyas ", b" Condorcanqui "], 590 | [b" Amazonas", b" Amazonas", b" Amazonas", b" Amazonas"], 591 | ] 592 | assert actual == expected 593 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py35,py36,py37,py38,py39,py310,py311 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | numpy 8 | setenv = 9 | LAZYCSV_INCLUDE_NUMPY=1 10 | commands = 11 | python -m pytest {posargs} 12 | 13 | --------------------------------------------------------------------------------