├── .gitignore ├── LICENSE ├── README.md ├── run ├── setup.py ├── src └── lazycsv │ ├── __init__.py │ └── lazycsv.c ├── tests ├── benchmark_lazy.py ├── fixtures │ ├── file.csv │ ├── file_crlf.csv │ ├── file_crlf2.csv │ ├── file_delimiter_and_quotechar.csv │ ├── file_empty.csv │ └── file_newline.csv ├── script_lazycsv.py └── test_lazycsv.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | venv/ 4 | .pytest_cache/ 5 | .tox/ 6 | .local/ 7 | __pycache__/ 8 | tests/fixtures/benchmarks/* 9 | 10 | *.so 11 | *.egg-info/ 12 | *.python-version 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Crunch.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lazycsv - a memory-efficient csv parser 2 | 3 | ###### Developers: Michael Green, Chris Perkins 4 | 5 | lazycsv is a C implementation of a csv parser for Python. The aim of this 6 | parser is to provide fast extraction of sequences of data from a CSV file 7 | in a memory-efficient manner, with zero dependencies. 8 | 9 | LazyCSV utilizes memory-mapped files and iterators to parse the file without 10 | persisting any significant amounts of data to physical memory. The design 11 | allows a user to generate PyObjects from a csv file lazily. 12 | 13 | The parser works as follows: 14 | 15 | First, the user file is memory-mapped internally to the LazyCSV object. That 16 | file is used to generate three indexes. The first is an index of values which 17 | correspond to the position in the user file where a given CSV field starts. 18 | This value is always a `uint16_t`, which we found to be the optimal bit size for 19 | disk usage and execution performance (this type can, however, be changed by 20 | setting the `LAZYCSV_INDEX_DTYPE` environment variable to any unsigned integer 21 | type). For index values outside the range of an unsigned short, an "anchor 22 | point" is created, which is a pair of `size_t` values that mark both the value 23 | to subtract from the index value so that it fits within 24 | 16 bits, and the first column of the CSV where the anchor value applies.
This 25 | anchor point is periodically written to the second index file when required for 26 | a given comma index. Finally, the third index writes the index of the first 27 | anchor point for each row of the file. 28 | 29 | When a user requests a sequence of data (i.e. a row or a column), an iterator 30 | is created and returned. This iterator uses the requested sequence value 31 | and its internal position state to look up in the index files the values 32 | representing the offset of the requested field and its length. Those two values 33 | are then used to create a single PyBytes object. These PyBytes objects are then 34 | yielded to the user per-iteration. 35 | 36 | This process is lazy, only yielding data from the user file as the iterator is 37 | consumed. It does not cache results as they are generated - it is the 38 | responsibility of the user to store in physical memory the data which must be 39 | persisted. The only persisted overhead in physical memory is the LazyCSV object 40 | itself, any created iterators, a small cache of common length-0 and length-1 41 | `PyObject*`'s for fast returns, and optionally the headers of the CSV file. 42 | 43 | ```python 44 | >>> from lazycsv import lazycsv 45 | >>> lazy = lazycsv.LazyCSV("tests/fixtures/file.csv") 46 | >>> lazy 47 | <lazycsv.LazyCSV object at 0x...> 48 | >>> (col := lazy.sequence(col=0)) 49 | <lazycsv_iterator object at 0x...> 50 | >>> next(col) 51 | b'0' 52 | >>> next(col) 53 | b'1' 54 | >>> next(col) 55 | Traceback (most recent call last): 56 | File "<stdin>", line 1, in <module> 57 | StopIteration 58 | ``` 59 | 60 | Since data is yielded through the iterator protocol, lazycsv pairs well with 61 | many of the builtin functional components of Python, and third-party libraries 62 | with support for iterators. This has the added benefit of keeping iterations in 63 | the C level, maximizing performance. 64 | 65 | ```python 66 | >>> row = lazy.sequence(row=1) 67 | >>> list(map(lambda x: x.decode('utf8'), row)) 68 | ['1', 'a1', 'b1'] 69 | >>> 70 | >>> import numpy as np 71 | >>> np.fromiter(map(int, lazy.sequence(col=0)), dtype=np.int64) 72 | array([0, 1]) 73 | ``` 74 | 75 | The `lazy` object also supports indexing operations for expressive iterables. 76 | The axis for iteration can be passed as a slice object, and the index of the 77 | iterable can be passed as an integer. Individual coordinate values can also be 78 | passed as a pair of integers; this call will eagerly return the value at that 79 | index. 80 | 81 | ```python 82 | >>> list(lazy[::-1, 1]) 83 | [b'a1', b'a0'] 84 | >>> lazy[-1, -1] 85 | b'b1' 86 | ``` 87 | 88 | Iterators can be materialized at any point by calling the `to_list()` or 89 | `to_numpy()` methods on the iterator object (to enable optional numpy support, 90 | see the Numpy section of this document). These methods exhaust the iterator, 91 | placing the remaining PyBytes values into a PyObject. 92 | 93 | ```python 94 | >>> col = lazy[:, 0] 95 | >>> next(col) 96 | b'0' 97 | >>> col.to_list() 98 | [b'1'] 99 | >>> 100 | ``` 101 | 102 | Headers are by default parsed from the csv file and packaged into a tuple under 103 | a `.headers` attribute. This can be skipped by passing `skip_headers=True` to 104 | the object constructor. Skipping the header parsing step results in the header 105 | value being included in the iterator.
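
Because `.headers` is an ordinary tuple, a column position can also be looked up
by name and fed back into `sequence()`. A small sketch, assuming the same fixture
file used in the examples above:

```python
>>> lazy = lazycsv.LazyCSV("tests/fixtures/file.csv")
>>> col = lazy.sequence(col=lazy.headers.index(b'ALPHA'))
>>> col.to_list()
[b'a0', b'a1']
```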
106 | 107 | *Note: `lazycsv` makes no effort to deduplicate headers and it is the 108 | responsibility of the user to make sure that columns are properly named.* 109 | 110 | ```python 111 | >>> lazy.headers 112 | (b'', b'ALPHA', b'BETA') 113 | >>> (col := lazy.sequence(col=1)) 114 | <lazycsv_iterator object at 0x...> 115 | >>> list(col) 116 | [b'a0', b'a1'] 117 | >>> lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 118 | >>> (col := lazy[:, 1]) 119 | <lazycsv_iterator object at 0x...> 120 | >>> list(col) 121 | [b'ALPHA', b'a0', b'a1'] 122 | ``` 123 | 124 | Fields which are double-quoted are by default yielded without quotes. This 125 | behavior can be disabled by passing `unquote=False` to the object constructor. 126 | 127 | ```python 128 | >>> lazy = lazycsv.LazyCSV( 129 | ... "tests/fixtures/file_crlf2.csv" 130 | ... ) 131 | >>> lazy.headers 132 | (b'', b'This,that', b'Fizz,Buzz') 133 | >>> lazy = lazycsv.LazyCSV( 134 | ... "tests/fixtures/file_crlf2.csv", unquote=False 135 | ... ) 136 | >>> lazy.headers 137 | (b'', b'"This,that"', b'"Fizz,Buzz"') 138 | ``` 139 | 140 | LazyCSV also provides the option to specify a delimiter and a quote character. 141 | Pass the keywords `delimiter=` and `quotechar=` to the object constructor to 142 | use custom values. `delimiter` defaults to `,` and `quotechar` 143 | defaults to `"`. 144 | 145 | ```python 146 | >>> lazy = lazycsv.LazyCSV( 147 | ... "tests/fixtures/file_delimiter_and_quotechar.csv", 148 | ... quotechar="|", 149 | ... delimiter="\t", 150 | ... unquote=False, 151 | ... ) 152 | ... 153 | >>> open(lazy.name, "rb").read() 154 | b'INDEX\tATTR\n0\t|A|\n1\t|B|\n' 155 | >>> list(lazy[:, 1]) 156 | [b'|A|', b'|B|'] 157 | ``` 158 | 159 | ### Numpy 160 | 161 | Optional, opt-in numpy support is built into the module. Access to this 162 | extended feature set requires building the extension from source while 163 | setting the `LAZYCSV_INCLUDE_NUMPY` environment variable to `1`. This adds a 164 | `to_numpy()` method to the iterator, which allows iterators to materialize into a 165 | 1-dimensional numpy array without creating intermediary PyObject*'s for each 166 | field of the CSV file. 167 | 168 | Access to this feature requires numpy to be preinstalled, as it makes 169 | numpy a compilation dependency. 170 | 171 | ```bash 172 | $ LAZYCSV_INCLUDE_NUMPY=1 python -m pip install lazycsv 173 | ``` 174 | ```python 175 | >>> import numpy as np 176 | >>> from lazycsv import lazycsv 177 | >>> lazy = lazycsv.LazyCSV("<path_to_csv_file>") 178 | >>> lazy = lazycsv.LazyCSV("./tests/fixtures/file.csv") 179 | >>> lazy.sequence(col=0).to_numpy().astype(np.int8) 180 | array([0, 1], dtype=int8) 181 | ``` 182 | 183 | Users pinned to an older version of numpy (<1.7) may wish to instead compile 184 | using a `LAZYCSV_INCLUDE_NUMPY_LEGACY=1` flag, which drops the API pin in the 185 | module while still compiling with numpy support. 186 | 187 | #### Benchmarks (CPU) 188 | 189 | CPU benchmarks are included below, run on a Ryzen 7 5800X inside a 190 | stock python3.9 docker container. 191 | 192 | ``` 193 | root@aa9d7c7ffb59:/code# python tests/benchmark_lazy.py 194 | filesize: 0.134gb 195 | cols=10000 196 | rows=10000 197 | sparsity=0.95 198 | 199 | benchmarking lazycsv: 200 | indexing lazy... time to index: 0.450414217018988 201 | parsing cols... time to parse: 1.5233540059998631 202 | total time: 1.9737682230188511 203 | 204 | benchmarking datatable: 205 | 100% |██████████████████████████████████████████████████| Reading data [done] 206 | creating datatables frame... time to object: 0.40828132900060154 207 | parsing cols... 
time to parse: 3.810204313998838 208 | total time: 4.21848564299944 209 | 210 | benchmarking polars (read): 211 | creating polars df... time to object: 2.357821761001105 212 | parsing cols... time to parse: 1.3874979300017003 213 | total time: 3.7453196910028055 214 | ``` 215 | 216 | ``` 217 | root@aa9d7c7ffb59:/code# python tests/benchmark_lazy.py 218 | filesize: 1.387gb 219 | cols=10000 220 | rows=100000 221 | sparsity=0.95 222 | 223 | benchmarking lazycsv: 224 | indexing lazy... time to index: 4.298127760004718 225 | parsing cols... time to parse: 18.591125406033825 226 | total time: 22.889253166038543 227 | 228 | benchmarking datatable: 229 | 100% |██████████████████████████████████████████████████| Reading data [done] 230 | creating datatables frame... time to object: 2.4456441220027045 231 | parsing cols... time to parse: 37.424315700998704 232 | total time: 39.86995982300141 233 | 234 | benchmarking polars (read): 235 | creating polars df... time to object: 22.383294907001982 236 | parsing cols... time to parse: 14.16580996599805 237 | total time: 36.54910487300003 238 | ``` 239 | 240 | ``` 241 | filesize: 14.333gb 242 | cols=100000 243 | rows=100000 244 | sparsity=0.95 245 | 246 | benchmarking lazycsv: 247 | indexing lazy... time to index: 55.42112316700002 248 | parsing cols... time to parse: 362.268973717 249 | total time: 417.690096884 250 | 251 | benchmarking datatable: 252 | 58% |█████████████████████████████▍ | Reading data Killed 253 | 254 | benchmarking polars (read): 255 | Killed 256 | ``` 257 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | versions="3.5 3.6 3.7 3.8 3.9 3.10 3.11 3.12 3.13" 4 | 5 | function run_version_tests { 6 | if [ "--build" = $2 ] 7 | then 8 | args="--inplace --force" 9 | LAZYCSV_INCLUDE_NUMPY=1 LAZYCSV_INDEX_DTYPE=uint8_t \ 10 | python setup.py build_ext $args &> /dev/null 11 | fi 12 | python -m pytest 13 | } 14 | 15 | function run_benchmarks { 16 | python ./tests/benchmark_lazy.py 17 | } 18 | 19 | function print_help { 20 | echo "bash commands:" 21 | echo "- bench: run benchmarks" 22 | echo "- test: run test suite" 23 | echo "- testrunner: spin up docker container for testing purposes" 24 | echo "- tox: run tox" 25 | } 26 | 27 | function run_testrunner { 28 | if [ -z $(command -v docker) ] 29 | then 30 | echo "environment tests requires docker executable" 31 | exit 1 32 | fi 33 | 34 | container=lazycsv_testrunner 35 | if [[ -z $(docker ps -a --format {{.Names}} | grep $container) ]] 36 | then 37 | docker run \ 38 | -v $(pwd):/code \ 39 | --name $container \ 40 | -e LAZYCSV_INCLUDE_NUMPY=1 \ 41 | -e PYENV_ROOT="/root/.pyenv" \ 42 | -e PATH="/root/.pyenv/shims:/root/.pyenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \ 43 | -dit debian:bookworm 44 | docker exec $container bash \ 45 | -c "apt-get update && apt-get install git wget tar make gcc build-essential gdb lcov pkg-config libbz2-dev libffi-dev libgdbm-dev libgdbm-compat-dev liblzma-dev libncurses5-dev libreadline6-dev libsqlite3-dev libssl-dev lzma lzma-dev tk-dev uuid-dev zlib1g-dev -y" 46 | docker exec $container bash \ 47 | -c "cd /root && git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv" 48 | docker exec $container bash -c "pyenv install $versions" 49 | docker exec $container bash -c "pyenv local $versions" 50 | for version in $versions 51 | do 52 | docker exec $container bash \ 53 | -c "python$version -m pip 
install numpy pytest && python$version -m pip install -e /code" 54 | done 55 | docker exec $container bash \ 56 | -c "python3.11 -m pip install tox" 57 | else 58 | docker start $container > /dev/null 59 | fi 60 | docker exec -it $container bash 61 | } 62 | 63 | function run_tox { 64 | python3.11 -m tox 65 | } 66 | 67 | function run_debug { 68 | LAZYCSV_INCLUDE_NUMPY=1 \ 69 | LAZYCSV_INDEX_DTYPE="uint8_t" \ 70 | LAZYCSV_DEBUG=1 \ 71 | CFLAGS="-O0" \ 72 | python setup.py build_ext --inplace --force \ 73 | && gdb --args python -m pytest ${@:2} 74 | } 75 | 76 | case $1 in 77 | testrunner) run_testrunner $@ ;; 78 | test) run_version_tests $@ ;; 79 | bench) run_benchmarks $@ ;; 80 | debug) run_debug $@ ;; 81 | tox) run_tox $@ ;; 82 | *) print_help $@ ;; 83 | esac 84 | 85 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import Extension, find_packages, setup 4 | 5 | LAZYCSV_DEBUG = int("LAZYCSV_DEBUG" in os.environ) 6 | LAZYCSV_INDEX_DTYPE = os.environ.get("LAZYCSV_INDEX_DTYPE", "uint16_t") 7 | 8 | LAZYCSV_INCLUDE_NUMPY = int("LAZYCSV_INCLUDE_NUMPY" in os.environ) 9 | LAZYCSV_INCLUDE_NUMPY_LEGACY = int("LAZYCSV_INCLUDE_NUMPY_LEGACY" in os.environ) 10 | 11 | include_dirs = ( 12 | [__import__("numpy").get_include()] 13 | if (LAZYCSV_INCLUDE_NUMPY | LAZYCSV_INCLUDE_NUMPY_LEGACY) 14 | else [] 15 | ) 16 | 17 | if not LAZYCSV_INDEX_DTYPE.startswith(("unsigned", "uint")): 18 | raise ValueError("specified LAZYCSV_INDEX_DTYPE must be an unsigned integer type") 19 | 20 | extensions = [ 21 | Extension( 22 | "lazycsv.lazycsv", 23 | [os.path.join("src", "lazycsv", "lazycsv.c")], 24 | include_dirs=include_dirs, 25 | define_macros=[ 26 | ("INDEX_DTYPE", LAZYCSV_INDEX_DTYPE), 27 | ("INCLUDE_NUMPY", LAZYCSV_INCLUDE_NUMPY), 28 | ("INCLUDE_NUMPY_LEGACY", LAZYCSV_INCLUDE_NUMPY_LEGACY), 29 | ("DEBUG", LAZYCSV_DEBUG), 30 | ], 31 | ) 32 | ] 33 | 34 | with open("README.md", "r", encoding="utf-8") as f: 35 | long_description = f.read() 36 | 37 | setup( 38 | name="lazycsv", 39 | version="1.1.7", 40 | author="Michael Green, Chris Perkins", 41 | author_email="dev@crunch.io", 42 | description="a fast, memory-efficient csv parser", 43 | long_description=long_description, 44 | long_description_content_type="text/markdown", 45 | packages=find_packages(where="src"), 46 | extras_require={ 47 | "test": ["pytest", "numpy"], 48 | "benchmark": ["datatable", "pandas", "pyarrow", "polars"], 49 | }, 50 | classifiers=[ 51 | "Development Status :: 5 - Production/Stable", 52 | "Intended Audience :: Developers", 53 | "License :: OSI Approved :: MIT License", 54 | "Natural Language :: English", 55 | "Operating System :: POSIX", 56 | "Operating System :: POSIX :: Linux", 57 | "Programming Language :: Python :: 3", 58 | "Programming Language :: Python :: 3.5", 59 | "Programming Language :: Python :: 3.6", 60 | "Programming Language :: Python :: 3.7", 61 | "Programming Language :: Python :: 3.8", 62 | "Programming Language :: Python :: 3.9", 63 | "Programming Language :: Python :: 3.10", 64 | "Programming Language :: Python :: 3.11", 65 | "Programming Language :: Python :: 3.12", 66 | "Programming Language :: Python :: 3.13", 67 | "Programming Language :: Python :: Implementation :: CPython", 68 | "Topic :: Utilities", 69 | ], 70 | package_dir={"": "src"}, 71 | ext_modules=extensions, 72 | ) 73 | --------------------------------------------------------------------------------
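
The environment variables read at the top of setup.py above map directly onto
build-time flags. A minimal sketch of a from-source build that exercises them,
mirroring the invocation used in the `run` script (the `uint32_t` value is only
an illustrative choice of unsigned integer type):

```bash
# build the extension in place with numpy support and a custom index dtype
$ LAZYCSV_INCLUDE_NUMPY=1 LAZYCSV_INDEX_DTYPE=uint32_t python setup.py build_ext --inplace --force
```
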
/src/lazycsv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Crunch-io/lazycsv/7949c63c0b681f270b72cca36a213f4af73bd64d/src/lazycsv/__init__.py -------------------------------------------------------------------------------- /src/lazycsv/lazycsv.c: -------------------------------------------------------------------------------- 1 | #include <fcntl.h> 2 | #include <limits.h> 3 | #include <stdint.h> 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <string.h> 7 | #include <sys/mman.h> 8 | #include <sys/stat.h> 9 | #include <unistd.h> 10 | 11 | #include <Python.h> 12 | #include "structmember.h" 13 | 14 | #define LINE_FEED 10 15 | #define CARRIAGE_RETURN 13 16 | 17 | // users can set this macro using the env variable LAZYCSV_INDEX_DTYPE if they 18 | // want to be more aggressive with minimizing index disk usage (i.e. define 19 | // INDEX_DTYPE as uint8_t) but at a cost to performance. 20 | 21 | #ifndef INDEX_DTYPE 22 | #define INDEX_DTYPE uint16_t 23 | #endif 24 | 25 | #ifdef DEBUG 26 | void PyDebug() {return;} 27 | #endif 28 | 29 | // optionally include a to_numpy() method on the iterable to materialize into a 30 | // numpy array, requires numpy install and to be set explicitly using env 31 | // variable LAZYCSV_INCLUDE_NUMPY=1, and LAZYCSV_INCLUDE_NUMPY_LEGACY=1 to 32 | // install using legacy numpy APIs 33 | 34 | #ifndef INCLUDE_NUMPY_LEGACY 35 | #define INCLUDE_NUMPY_LEGACY 0 36 | #endif 37 | #if INCLUDE_NUMPY_LEGACY 38 | #ifdef INCLUDE_NUMPY 39 | #undef INCLUDE_NUMPY 40 | #endif 41 | #define INCLUDE_NUMPY 1 42 | #else 43 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 44 | #endif 45 | #ifndef INCLUDE_NUMPY 46 | #define INCLUDE_NUMPY 0 47 | #endif 48 | #if INCLUDE_NUMPY 49 | #include <numpy/arrayobject.h> 50 | #endif 51 | 52 | 53 | static size_t INDEX_DTYPE_MAX = ((INDEX_DTYPE) ~(INDEX_DTYPE)0); 54 | 55 | 56 | typedef struct { 57 | char* data; 58 | size_t size; 59 | size_t capacity; 60 | } LazyCSV_Buffer; 61 | 62 | 63 | typedef struct { 64 | PyObject* empty; 65 | PyObject** items; 66 | } LazyCSV_Cache; 67 | 68 | 69 | typedef struct { 70 | size_t col; 71 | size_t value; 72 | } LazyCSV_AnchorPoint; 73 | 74 | 75 | typedef struct { 76 | size_t index; 77 | size_t count; 78 | } LazyCSV_RowIndex; 79 | 80 | 81 | typedef struct { 82 | int fd; 83 | struct stat st; 84 | char* name; 85 | char* data; 86 | } LazyCSV_File; 87 | 88 | 89 | typedef struct { 90 | PyObject* dir; 91 | LazyCSV_File* commas; 92 | LazyCSV_File* anchors; 93 | LazyCSV_File* newlines; 94 | } LazyCSV_Index; 95 | 96 | 97 | typedef struct { 98 | PyObject_HEAD 99 | PyObject* headers; 100 | PyObject* name; 101 | size_t rows; 102 | size_t cols; 103 | int _skip_headers; 104 | int _unquote; 105 | char _quotechar; 106 | char _newline; 107 | LazyCSV_Index* _index; 108 | LazyCSV_File* _data; 109 | LazyCSV_Cache* _cache; 110 | } LazyCSV; 111 | 112 | 113 | typedef struct { 114 | PyObject_HEAD 115 | PyObject* lazy; 116 | size_t row; 117 | size_t col; 118 | size_t position; 119 | size_t stop; 120 | size_t step; 121 | char reversed; 122 | } LazyCSV_Iter; 123 | 124 | 125 | static inline ssize_t LazyCSV_BufferWrite(int fd, LazyCSV_Buffer *buffer, 126 | void *data, size_t size) { 127 | 128 | ssize_t bytes_written = 0; 129 | 130 | if (buffer->size + size >= buffer->capacity) { 131 | bytes_written = write(fd, buffer->data, buffer->size); 132 | buffer->size = 0; 133 | } 134 | memcpy(&buffer->data[buffer->size], data, size); 135 | buffer->size += size; 136 | return bytes_written; 137 | } 138 | 139 | 140 | static inline void LazyCSV_BufferCache(LazyCSV_Buffer *buffer, void *data, 141 | size_t
size) { 142 | 143 | if (size == 0) return; 144 | 145 | if (buffer->size + size >= buffer->capacity) { 146 | buffer->capacity += size; 147 | buffer->capacity *= 1.3; 148 | buffer->data = realloc(buffer->data, buffer->capacity); 149 | } 150 | memcpy(&buffer->data[buffer->size], data, size); 151 | buffer->size += size; 152 | } 153 | 154 | 155 | static inline ssize_t LazyCSV_BufferFlush(int comma_file, LazyCSV_Buffer *buffer) { 156 | ssize_t bytes_written = 0; 157 | bytes_written = write(comma_file, buffer->data, buffer->size); 158 | if (bytes_written < 0) { 159 | return bytes_written; 160 | } 161 | buffer->size = 0; 162 | fsync(comma_file); 163 | return bytes_written; 164 | } 165 | 166 | 167 | static inline ssize_t LazyCSV_ValueToDisk(size_t value, LazyCSV_RowIndex *ridx, 168 | LazyCSV_AnchorPoint *apnt, 169 | size_t col_index, int cfile, 170 | LazyCSV_Buffer *cbuf, int afile, 171 | LazyCSV_Buffer *abuf) { 172 | 173 | size_t target = value - apnt->value; 174 | 175 | if (target > INDEX_DTYPE_MAX) { 176 | *apnt = (LazyCSV_AnchorPoint){.value = value, .col = col_index+1}; 177 | ssize_t bytes_written = LazyCSV_BufferWrite(afile, abuf, apnt, sizeof(LazyCSV_AnchorPoint)); 178 | if (bytes_written < 0) 179 | return bytes_written; 180 | ridx->count += 1; 181 | target = 0; 182 | } 183 | 184 | INDEX_DTYPE item = target; 185 | 186 | return LazyCSV_BufferWrite(cfile, cbuf, &item, sizeof(INDEX_DTYPE)); 187 | } 188 | 189 | 190 | static inline size_t LazyCSV_AnchorValueFromValue(size_t value, 191 | LazyCSV_AnchorPoint *amap, 192 | LazyCSV_RowIndex *ridx) { 193 | 194 | LazyCSV_AnchorPoint *apnt = amap + ridx->count - 1; 195 | 196 | if (value >= apnt->col) { 197 | // we hit this if there is only one anchor point, or we're iterating 198 | // over the last anchor point. 199 | return apnt->value; 200 | } 201 | 202 | LazyCSV_AnchorPoint* apntp1; 203 | size_t L = 0, R = ridx->count-1; 204 | 205 | while (L <= R) { 206 | size_t M = L + ((R - L)/2); 207 | apnt = amap + M; 208 | apntp1 = apnt + 1; 209 | if (value > apntp1->col) { 210 | L = M + 1; 211 | } 212 | else if (value < apnt->col) { 213 | R = M - 1; 214 | } 215 | else if (value == apntp1->col) { 216 | return apntp1->value; 217 | } 218 | else { 219 | return apnt->value; 220 | } 221 | } 222 | return SIZE_MAX; 223 | } 224 | 225 | 226 | static inline size_t LazyCSV_ValueFromIndex(size_t value, 227 | LazyCSV_RowIndex *ridx, char *cmap, 228 | char *amap) { 229 | 230 | size_t cval = *(INDEX_DTYPE *)(cmap + (value * sizeof(INDEX_DTYPE))); 231 | size_t aval = 232 | LazyCSV_AnchorValueFromValue(value, (LazyCSV_AnchorPoint *)amap, ridx); 233 | return aval == SIZE_MAX ? aval : cval + aval; 234 | } 235 | 236 | 237 | static inline void LazyCSV_IterCol(LazyCSV_Iter *iter, size_t *offset, 238 | size_t *len) { 239 | 240 | LazyCSV *lazy = (LazyCSV *)iter->lazy; 241 | 242 | if (iter->position < iter->stop) { 243 | size_t position = 244 | iter->reversed 245 | ? 
lazy->rows - 1 - iter->position + !lazy->_skip_headers 246 | : iter->position + !lazy->_skip_headers; 247 | 248 | iter->position += iter->step; 249 | 250 | char* newlines = lazy->_index->newlines->data; 251 | char* anchors = lazy->_index->anchors->data; 252 | char* commas = lazy->_index->commas->data; 253 | 254 | LazyCSV_RowIndex* ridx = 255 | (LazyCSV_RowIndex*) 256 | (newlines + position*sizeof(LazyCSV_RowIndex)); 257 | 258 | char* aidx = anchors+ridx->index; 259 | char* cidx = commas+((lazy->cols+1)*position*sizeof(INDEX_DTYPE)); 260 | 261 | size_t cs = LazyCSV_ValueFromIndex(iter->col, ridx, cidx, aidx); 262 | size_t ce = LazyCSV_ValueFromIndex(iter->col + 1, ridx, cidx, aidx); 263 | 264 | *len = ce - cs - 1; 265 | *offset = cs; 266 | } 267 | } 268 | 269 | 270 | static inline void LazyCSV_IterRow(LazyCSV_Iter *iter, size_t *offset, 271 | size_t *len) { 272 | 273 | LazyCSV *lazy = (LazyCSV *)iter->lazy; 274 | 275 | if (iter->position < iter->stop) { 276 | size_t position = 277 | iter->reversed ? lazy->cols - iter->position - 1 : iter->position; 278 | 279 | iter->position += iter->step; 280 | 281 | char* newlines = lazy->_index->newlines->data; 282 | char* anchors = lazy->_index->anchors->data; 283 | char* commas = lazy->_index->commas->data; 284 | 285 | size_t row = iter->row + !lazy->_skip_headers; 286 | 287 | LazyCSV_RowIndex* ridx = 288 | (LazyCSV_RowIndex*) 289 | (newlines + row*sizeof(LazyCSV_RowIndex)); 290 | 291 | char *aidx = anchors + ridx->index; 292 | char *cidx = commas + ((lazy->cols + 1) * row * sizeof(INDEX_DTYPE)); 293 | 294 | size_t cs = LazyCSV_ValueFromIndex(position, ridx, cidx, aidx); 295 | size_t ce = LazyCSV_ValueFromIndex(position + 1, ridx, cidx, aidx); 296 | 297 | *len = ce - cs - 1; 298 | *offset = cs; 299 | } 300 | } 301 | 302 | 303 | static inline PyObject *PyBytes_FromOffsetAndLen(LazyCSV *lazy, size_t offset, 304 | size_t len) { 305 | 306 | PyObject* result; 307 | char* addr; 308 | 309 | switch (len) { 310 | case SIZE_MAX: 311 | case 0: 312 | // short circuit if result is empty string 313 | result = lazy->_cache->empty; 314 | Py_INCREF(result); 315 | break; 316 | case 1: 317 | addr = lazy->_data->data + offset; 318 | unsigned char index = *addr; // explicit unsigned char for indexing purposes, 319 | // *(char*) signed-ness is ambiguous and on some 320 | // architectures it results in a negative number 321 | result = lazy->_cache->items[index]; 322 | Py_INCREF(result); 323 | break; 324 | default: 325 | addr = lazy->_data->data + offset; 326 | 327 | char strip_quotes = ( 328 | lazy->_unquote 329 | && addr[0] == lazy->_quotechar 330 | && addr[len-1] == lazy->_quotechar 331 | ); 332 | 333 | if (strip_quotes) { 334 | addr = addr+1; 335 | len = len-2; 336 | } 337 | 338 | result = PyBytes_FromStringAndSize(addr, len); 339 | 340 | if (!result) { 341 | PyErr_SetString( 342 | PyExc_RuntimeError, 343 | "could not allocate memory for new object" 344 | ); 345 | return NULL; 346 | } 347 | } 348 | 349 | return result; 350 | } 351 | 352 | 353 | static PyObject* LazyCSV_IterNext(PyObject* self) { 354 | LazyCSV_Iter* iter = (LazyCSV_Iter*)self; 355 | LazyCSV *lazy = (LazyCSV *)iter->lazy; 356 | 357 | size_t offset = SIZE_MAX, len; 358 | 359 | switch ((iter->row == SIZE_MAX) - (iter->col == SIZE_MAX)) { 360 | case -1: 361 | LazyCSV_IterRow(iter, &offset, &len); 362 | break; 363 | case +1: 364 | LazyCSV_IterCol(iter, &offset, &len); 365 | break; 366 | default: 367 | PyErr_SetString( 368 | PyExc_RuntimeError, 369 | "could not determine axis for materialization" 370 | ); 371 | 
return NULL; 372 | } 373 | 374 | if (offset==SIZE_MAX) { 375 | PyErr_SetNone(PyExc_StopIteration); 376 | return NULL; 377 | } 378 | 379 | return PyBytes_FromOffsetAndLen(lazy, offset, len); 380 | } 381 | 382 | 383 | static PyObject* LazyCSV_IterAsList(PyObject* self) { 384 | LazyCSV_Iter* iter = (LazyCSV_Iter*)self; 385 | LazyCSV* lazy = (LazyCSV*)iter->lazy; 386 | 387 | size_t size; 388 | size_t iter_col = iter->col; 389 | size_t iter_row = iter->row; 390 | 391 | if (iter_col == SIZE_MAX) { 392 | size = lazy->cols - iter->position; 393 | } 394 | else if (iter_row == SIZE_MAX) { 395 | size = lazy->rows - iter->position; 396 | } 397 | else { 398 | PyErr_SetString( 399 | PyExc_RuntimeError, 400 | "could not determine axis for materialization" 401 | ); 402 | return NULL; 403 | } 404 | 405 | PyObject* result = PyList_New(size); 406 | if (!result) { 407 | PyErr_SetString( 408 | PyExc_RuntimeError, 409 | "could not allocate memory for new list" 410 | ); 411 | return NULL; 412 | } 413 | size_t offset=SIZE_MAX, len=0; 414 | 415 | PyObject* item; 416 | for (size_t i = 0; i < size; i++) { 417 | switch (iter_col) { 418 | case SIZE_MAX: 419 | LazyCSV_IterRow(iter, &offset, &len); 420 | break; 421 | default: 422 | LazyCSV_IterCol(iter, &offset, &len); 423 | } 424 | item = PyBytes_FromOffsetAndLen(lazy, offset, len); 425 | if (!item) { 426 | // err msg set in PyBytes_FromOffsetAndLen, 427 | // just need to decref the list here 428 | Py_DECREF(result); 429 | return item; 430 | } 431 | PyList_SET_ITEM(result, i, item); 432 | } 433 | 434 | return result; 435 | } 436 | 437 | 438 | #if INCLUDE_NUMPY 439 | static PyObject* LazyCSV_IterAsNumpy(PyObject* self) { 440 | LazyCSV_Iter* iter = (LazyCSV_Iter*)self; 441 | LazyCSV* lazy = (LazyCSV*)iter->lazy; 442 | 443 | size_t size; 444 | size_t iter_col = iter->col; 445 | size_t iter_row = iter->row; 446 | 447 | if (iter_col != SIZE_MAX) { 448 | size = lazy->rows - iter->position; 449 | } 450 | else if (iter_row != SIZE_MAX) { 451 | size = lazy->cols - iter->position; 452 | } 453 | else { 454 | PyErr_SetString( 455 | PyExc_RuntimeError, 456 | "could not determine axis for materialization" 457 | ); 458 | return NULL; 459 | } 460 | 461 | size_t buffer_capacity = 65536; // 2**16 462 | LazyCSV_Buffer buffer = {.data = malloc(buffer_capacity), 463 | .size = 0, 464 | .capacity = buffer_capacity}; 465 | 466 | size_t offset, len=0, max_len=0; 467 | char* addr; 468 | 469 | for (size_t i=0; i < size; i++) { 470 | switch (iter_col) { 471 | case SIZE_MAX: 472 | LazyCSV_IterRow(iter, &offset, &len); 473 | break; 474 | default: 475 | LazyCSV_IterCol(iter, &offset, &len); 476 | } 477 | addr = lazy->_data->data + offset; 478 | LazyCSV_BufferCache(&buffer, &len, sizeof(size_t)); 479 | LazyCSV_BufferCache(&buffer, addr, len); 480 | max_len = len > max_len ? 
len : max_len; 481 | } 482 | 483 | npy_intp const dimensions[1] = {size, }; 484 | npy_intp const strides[1] = {max_len, }; 485 | 486 | PyArrayObject *arr = 487 | (PyArrayObject *)PyArray_New(&PyArray_Type, 1, dimensions, NPY_STRING, 488 | strides, NULL, max_len, 0, NULL); 489 | 490 | if (!arr) { 491 | free(buffer.data); 492 | PyErr_SetString( 493 | PyExc_RuntimeError, 494 | "could not allocate numpy array" 495 | ); 496 | return NULL; 497 | } 498 | 499 | char* tempbuf = buffer.data; 500 | char* arrdata = PyArray_DATA(arr); 501 | 502 | for (size_t i = 0; i < size; i++) { 503 | len = *(size_t *)tempbuf; 504 | tempbuf += sizeof(size_t); 505 | size_t padlen = max_len - len; 506 | strncpy(arrdata, tempbuf, len); 507 | tempbuf += len; 508 | arrdata += len; 509 | memset(arrdata, 0, padlen); 510 | arrdata += padlen; 511 | } 512 | 513 | free(buffer.data); 514 | 515 | return PyArray_Return(arr); 516 | } 517 | #endif 518 | 519 | 520 | static PyObject* LazyCSV_IterSelf(PyObject* self) { 521 | Py_INCREF(self); 522 | return self; 523 | } 524 | 525 | 526 | static void LazyCSV_IterDestruct(LazyCSV_Iter* self) { 527 | Py_DECREF(self->lazy); 528 | Py_TYPE(self)->tp_free((PyObject*)self); 529 | } 530 | 531 | 532 | static PyMethodDef LazyCSV_IterMethods[] = { 533 | #if INCLUDE_NUMPY 534 | { 535 | "to_numpy", 536 | (PyCFunction)LazyCSV_IterAsNumpy, 537 | METH_NOARGS, 538 | "materialize iterator into a numpy array" 539 | }, 540 | #endif 541 | { 542 | "to_list", 543 | (PyCFunction)LazyCSV_IterAsList, 544 | METH_NOARGS, 545 | "materialize iterator into a list" 546 | }, 547 | {NULL, } 548 | }; 549 | 550 | 551 | static PyTypeObject LazyCSV_IterType = { 552 | PyVarObject_HEAD_INIT(NULL, 0) 553 | .tp_name = "lazycsv_iterator", 554 | .tp_itemsize = sizeof(LazyCSV_Iter), 555 | .tp_dealloc = (destructor)LazyCSV_IterDestruct, 556 | .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, 557 | .tp_doc = "LazyCSV iterable", 558 | .tp_methods = LazyCSV_IterMethods, 559 | .tp_iter = LazyCSV_IterSelf, 560 | .tp_iternext = LazyCSV_IterNext, 561 | }; 562 | 563 | 564 | static inline void LazyCSV_TempDirAsString(PyObject **tempdir, char **dirname) { 565 | PyObject *tempfile = PyImport_ImportModule("tempfile"); 566 | PyObject *tempdir_obj = 567 | PyObject_GetAttrString(tempfile, "TemporaryDirectory"); 568 | 569 | *tempdir = PyObject_CallObject(tempdir_obj, NULL); 570 | PyObject* dirname_obj = PyObject_GetAttrString(*tempdir, "name"); 571 | PyObject* dirstring = PyUnicode_AsUTF8String(dirname_obj); 572 | *dirname = PyBytes_AsString(dirstring); 573 | 574 | Py_DECREF(tempfile); 575 | Py_DECREF(tempdir_obj); 576 | Py_DECREF(dirname_obj); 577 | Py_DECREF(dirstring); 578 | } 579 | 580 | 581 | static inline void LazyCSV_FullNameFromName(PyObject *name, 582 | PyObject **fullname_obj, 583 | char **fullname) { 584 | 585 | PyObject *os_path = PyImport_ImportModule("os.path"); 586 | PyObject *isfile = PyObject_CallMethod(os_path, "isfile", "O", name); 587 | 588 | PyObject *builtins = PyImport_ImportModule("builtins"); 589 | PyObject* global_vars = PyObject_CallMethod(builtins, "globals", NULL); 590 | 591 | // borrowed ref 592 | PyObject* __file__ = PyDict_GetItemString(global_vars, "__file__"); 593 | 594 | if (isfile == Py_True) { 595 | // owned reference which we keep 596 | *fullname_obj = PyObject_CallMethod(os_path, "abspath", "O", name); 597 | *fullname = PyBytes_AsString(*fullname_obj); 598 | } 599 | 600 | else if (__file__) { 601 | // also check to see if file is relative to the caller if not 602 | // previously found 603 | PyObject 
*_dirname = 604 | PyObject_CallMethod(os_path, "dirname", "O", __file__); 605 | 606 | PyObject *dirname = PyUnicode_AsUTF8String(_dirname); 607 | 608 | PyObject *joined = 609 | PyObject_CallMethod(os_path, "join", "(OO)", dirname, name); 610 | 611 | *fullname_obj = PyObject_CallMethod(os_path, "abspath", "O", joined); 612 | *fullname = PyBytes_AsString(*fullname_obj); 613 | 614 | Py_DECREF(joined); 615 | Py_DECREF(_dirname); 616 | Py_DECREF(dirname); 617 | } 618 | 619 | Py_DECREF(os_path); 620 | Py_DECREF(isfile); 621 | Py_DECREF(builtins); 622 | Py_DECREF(global_vars); 623 | } 624 | 625 | 626 | static PyObject *LazyCSV_New(PyTypeObject *type, PyObject *args, 627 | PyObject *kwargs) { 628 | 629 | PyObject* name; 630 | int skip_headers = 0; 631 | int unquote = 1; 632 | Py_ssize_t buffer_capacity = 2097152; // 2**21 633 | char *dirname, *delimiter = ",", *quotechar = "\""; 634 | ssize_t bytes_written = 0; 635 | 636 | static char* kwlist[] = { 637 | "", "delimiter", "quotechar", "skip_headers", "unquote", "buffer_size", "index_dir", NULL 638 | }; 639 | 640 | char ok = PyArg_ParseTupleAndKeywords( 641 | args, kwargs, "O|ssppns", kwlist, &name, &delimiter, &quotechar, 642 | &skip_headers, &unquote, &buffer_capacity, &dirname); 643 | 644 | if (!ok) { 645 | PyErr_SetString( 646 | PyExc_ValueError, 647 | "unable to parse function arguments" 648 | ); 649 | return NULL; 650 | } 651 | 652 | if (buffer_capacity < 0) { 653 | PyErr_SetString( 654 | PyExc_ValueError, 655 | "buffer size cannot be less than 0" 656 | ); 657 | return NULL; 658 | } 659 | 660 | Py_INCREF(name); 661 | if (PyUnicode_CheckExact(name)) { 662 | PyObject* _name = PyUnicode_AsUTF8String(name); 663 | Py_DECREF(name); 664 | name = _name; 665 | } 666 | 667 | if (!PyBytes_CheckExact(name)) { 668 | PyErr_SetString( 669 | PyExc_ValueError, 670 | "first argument must be str or bytes" 671 | ); 672 | Py_DECREF(name); 673 | return NULL; 674 | } 675 | 676 | PyObject* fullname_obj = NULL; 677 | char* fullname = NULL; 678 | LazyCSV_FullNameFromName(name, &fullname_obj, &fullname); 679 | 680 | if (!fullname_obj) { 681 | PyErr_SetString( 682 | PyExc_MemoryError, 683 | "unable to initialize filepath object" 684 | ); 685 | return NULL; 686 | } 687 | 688 | if (!fullname) { 689 | Py_XDECREF(fullname_obj); 690 | PyErr_SetString( 691 | PyExc_MemoryError, 692 | "unable to determine file path from *PyObject" 693 | ); 694 | return NULL; 695 | } 696 | 697 | Py_DECREF(name); 698 | 699 | int ufd = open(fullname, O_RDONLY); 700 | if (ufd == -1) { 701 | PyErr_SetString( 702 | PyExc_FileNotFoundError, 703 | "unable to open data file," 704 | " check to be sure that the user has read permissions" 705 | " and/or ownership of the file, and that the file exists."
706 | ); 707 | return NULL; 708 | } 709 | 710 | struct stat ust; 711 | if (fstat(ufd, &ust) < 0) { 712 | PyErr_SetString( 713 | PyExc_RuntimeError, 714 | "unable to stat user file" 715 | ); 716 | goto close_ufd; 717 | } 718 | 719 | size_t file_len = ust.st_size; 720 | 721 | int mmap_flags = PROT_READ; 722 | char* file = mmap(NULL, file_len, mmap_flags, MAP_PRIVATE, ufd, 0); 723 | 724 | PyObject* tempdir = NULL; 725 | if (!dirname) { 726 | LazyCSV_TempDirAsString(&tempdir, &dirname); 727 | } 728 | 729 | char* comma_index = tempnam(dirname, "LzyC_"); 730 | char* anchor_index = tempnam(dirname, "LzyA_"); 731 | char* newline_index = tempnam(dirname, "LzyN_"); 732 | 733 | int file_flags = O_WRONLY|O_CREAT|O_EXCL; 734 | 735 | int comma_file = open(comma_index, file_flags, S_IRWXU); 736 | if (comma_file < 0) 737 | goto close_comma_file; 738 | 739 | int anchor_file = open(anchor_index, file_flags, S_IRWXU); 740 | if (anchor_file < 0) 741 | goto close_anchor_file; 742 | 743 | int newline_file = open(newline_index, file_flags, S_IRWXU); 744 | if (newline_file < 0) 745 | goto close_newline_file; 746 | 747 | char quoted = 0, c, cm1 = LINE_FEED, cm2 = 0; 748 | size_t rows = 0, cols = SIZE_MAX, row_index = 0, col_index = 0; 749 | 750 | int newline = -1; 751 | 752 | // overflow happens when a row has more columns than the header row, 753 | // if this happens during the parse, the comma of the nth col will indicate 754 | // the line ending. Underflow happens when a row has less columns than the 755 | // header row, and missing values will be appended to the row as an empty 756 | // field. 757 | 758 | size_t overflow = SIZE_MAX; 759 | char *overflow_warning = NULL, *underflow_warning = NULL; 760 | 761 | LazyCSV_RowIndex ridx = {.index = 0, .count = 0}; 762 | 763 | LazyCSV_AnchorPoint apnt; 764 | 765 | LazyCSV_Buffer comma_buffer = {.data = malloc(buffer_capacity), 766 | .size = 0, 767 | .capacity = buffer_capacity}; 768 | 769 | LazyCSV_Buffer anchor_buffer = {.data = malloc(buffer_capacity), 770 | .size = 0, 771 | .capacity = buffer_capacity}; 772 | 773 | LazyCSV_Buffer newline_buffer = {.data = malloc(buffer_capacity), 774 | .size = 0, 775 | .capacity = buffer_capacity}; 776 | 777 | for (size_t i = 0; i < file_len; i++) { 778 | 779 | if (overflow != SIZE_MAX && i < overflow) { 780 | continue; 781 | } 782 | 783 | c = file[i]; 784 | 785 | if (col_index == 0 786 | && (cm1 == LINE_FEED || cm1 == CARRIAGE_RETURN) 787 | && cm2 != CARRIAGE_RETURN) { 788 | size_t val = ( 789 | newline == (CARRIAGE_RETURN+LINE_FEED) 790 | ) ? 
i + 1 : i; 791 | 792 | apnt = (LazyCSV_AnchorPoint){.value = val, .col = col_index}; 793 | 794 | LazyCSV_BufferWrite(anchor_file, &anchor_buffer, &apnt, 795 | sizeof(LazyCSV_AnchorPoint)); 796 | 797 | ridx.index += ridx.count*sizeof(LazyCSV_AnchorPoint); 798 | ridx.count = 1; 799 | 800 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 801 | &comma_buffer, anchor_file, &anchor_buffer); 802 | if (bytes_written < 0) 803 | goto close_newline_file; 804 | } 805 | 806 | if (c == *quotechar) { 807 | quoted = !quoted; 808 | } 809 | 810 | else if (!quoted && c == *delimiter) { 811 | size_t val = i + 1; 812 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 813 | &comma_buffer, anchor_file, &anchor_buffer); 814 | if (bytes_written < 0) 815 | goto close_newline_file; 816 | if (cols == SIZE_MAX || col_index < cols) { 817 | col_index += 1; 818 | } 819 | else { 820 | overflow_warning = 821 | "column overflow encountered while parsing CSV, " 822 | "extra values will be truncated!"; 823 | overflow = i; 824 | 825 | for (;;) { 826 | if (file[overflow] == LINE_FEED || 827 | file[overflow] == CARRIAGE_RETURN) 828 | break; 829 | else if (overflow >= file_len) 830 | break; 831 | overflow += 1; 832 | } 833 | } 834 | } 835 | 836 | else if (!quoted && c == LINE_FEED && cm1 == CARRIAGE_RETURN) { 837 | // no-op, don't match next block for \r\n 838 | } 839 | 840 | else if (!quoted && (c == CARRIAGE_RETURN || c == LINE_FEED)) { 841 | size_t val = i + 1; 842 | 843 | if (overflow == SIZE_MAX) { 844 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 845 | &comma_buffer, anchor_file, &anchor_buffer); 846 | if (bytes_written < 0) 847 | goto close_newline_file; 848 | } 849 | else { 850 | overflow = SIZE_MAX; 851 | } 852 | 853 | if (row_index == 0) { 854 | cols = col_index; 855 | } 856 | 857 | else if (col_index < cols) { 858 | underflow_warning = 859 | "column underflow encountered while parsing CSV, " 860 | "missing values will be filled with the empty bytestring!"; 861 | while (col_index < cols) { 862 | bytes_written = LazyCSV_ValueToDisk(val, &ridx, &apnt, col_index, comma_file, 863 | &comma_buffer, anchor_file, 864 | &anchor_buffer); 865 | if (bytes_written < 0) 866 | goto close_newline_file; 867 | col_index += 1; 868 | } 869 | } 870 | 871 | if (newline == -1) { 872 | newline = (c == CARRIAGE_RETURN && file[i + 1] == LINE_FEED) 873 | ? 
LINE_FEED + CARRIAGE_RETURN 874 | : c; 875 | } 876 | 877 | LazyCSV_BufferWrite(newline_file, &newline_buffer, &ridx, 878 | sizeof(LazyCSV_RowIndex)); 879 | 880 | col_index = 0; 881 | row_index += 1; 882 | } 883 | 884 | cm2 = cm1; 885 | cm1 = c; 886 | } 887 | 888 | char last_char = file[file_len - 1]; 889 | char overcount = last_char == CARRIAGE_RETURN || last_char == LINE_FEED; 890 | 891 | if (!overcount) { 892 | bytes_written = LazyCSV_ValueToDisk(file_len + 1, &ridx, &apnt, col_index, comma_file, 893 | &comma_buffer, anchor_file, &anchor_buffer); 894 | if (bytes_written < 0) 895 | goto close_newline_file; 896 | 897 | bytes_written = LazyCSV_BufferWrite(newline_file, &newline_buffer, &ridx, 898 | sizeof(LazyCSV_RowIndex)); 899 | 900 | if (bytes_written < 0) 901 | goto close_newline_file; 902 | } 903 | 904 | if (overflow_warning) 905 | PyErr_WarnEx( 906 | PyExc_RuntimeWarning, 907 | overflow_warning, 908 | 1 909 | ); 910 | 911 | if (underflow_warning) 912 | PyErr_WarnEx( 913 | PyExc_RuntimeWarning, 914 | underflow_warning, 915 | 1 916 | ); 917 | 918 | rows = row_index - overcount + skip_headers; 919 | cols = cols + 1; 920 | 921 | bytes_written = LazyCSV_BufferFlush(comma_file, &comma_buffer); 922 | if (bytes_written < 0) 923 | goto close_newline_file; 924 | bytes_written = LazyCSV_BufferFlush(anchor_file, &anchor_buffer); 925 | if (bytes_written < 0) 926 | goto close_newline_file; 927 | bytes_written = LazyCSV_BufferFlush(newline_file, &newline_buffer); 928 | if (bytes_written < 0) 929 | goto close_newline_file; 930 | 931 | close(comma_file); 932 | close(anchor_file); 933 | close(newline_file); 934 | 935 | free(comma_buffer.data); 936 | free(anchor_buffer.data); 937 | free(newline_buffer.data); 938 | 939 | comma_file = open(comma_index, O_RDWR); 940 | struct stat comma_st; 941 | if (fstat(comma_file, &comma_st) < 0) { 942 | PyErr_SetString( 943 | PyExc_RuntimeError, 944 | "unable to stat comma file" 945 | ); 946 | goto close_comma_file; 947 | } 948 | 949 | anchor_file = open(anchor_index, O_RDWR); 950 | struct stat anchor_st; 951 | if (fstat(anchor_file, &anchor_st) < 0) { 952 | PyErr_SetString( 953 | PyExc_RuntimeError, 954 | "unable to stat anchor file" 955 | ); 956 | goto close_anchor_file; 957 | } 958 | 959 | newline_file = open(newline_index, O_RDWR); 960 | struct stat newline_st; 961 | if (fstat(newline_file, &newline_st) < 0) { 962 | PyErr_SetString( 963 | PyExc_RuntimeError, 964 | "unable to stat newline file" 965 | ); 966 | goto close_newline_file; 967 | } 968 | 969 | char *comma_memmap = 970 | mmap(NULL, comma_st.st_size, mmap_flags, MAP_PRIVATE, comma_file, 0); 971 | char *anchor_memmap = 972 | mmap(NULL, anchor_st.st_size, mmap_flags, MAP_PRIVATE, anchor_file, 0); 973 | char *newline_memmap = 974 | mmap(NULL, newline_st.st_size, mmap_flags, MAP_PRIVATE, newline_file, 0); 975 | 976 | PyObject* headers; 977 | 978 | if (!skip_headers) { 979 | headers = PyTuple_New(cols); 980 | LazyCSV_RowIndex* ridx = (LazyCSV_RowIndex*)newline_memmap; 981 | 982 | size_t cs, ce; 983 | size_t len; 984 | char *addr; 985 | for (size_t i = 0; i < cols; i++) { 986 | cs = LazyCSV_ValueFromIndex(i, ridx, comma_memmap, anchor_memmap); 987 | ce = LazyCSV_ValueFromIndex(i + 1, ridx, comma_memmap, 988 | anchor_memmap); 989 | 990 | if (ce - cs == 1) { 991 | PyTuple_SET_ITEM(headers, i, PyBytes_FromString("")); 992 | } 993 | else { 994 | addr = file + cs; 995 | 996 | len = ce - cs - 1; 997 | if (unquote 998 | && addr[0] == *quotechar 999 | && addr[len-1] == *quotechar) { 1000 | addr = addr+1; 1001 | len = 
len-2; 1002 | } 1003 | PyTuple_SET_ITEM( 1004 | headers, i, PyBytes_FromStringAndSize(addr, len) 1005 | ); 1006 | } 1007 | } 1008 | } 1009 | else { 1010 | headers = PyTuple_New(0); 1011 | } 1012 | 1013 | LazyCSV* self = (LazyCSV*)type->tp_alloc(type, 0); 1014 | if (!self) { 1015 | PyErr_SetString( 1016 | PyExc_MemoryError, 1017 | "unable to allocate LazyCSV object" 1018 | ); 1019 | goto unmap_memmaps; 1020 | } 1021 | 1022 | LazyCSV_Cache* _cache = malloc(sizeof(LazyCSV_Cache)); 1023 | _cache->empty = PyBytes_FromString(""); 1024 | _cache->items = malloc((UCHAR_MAX+1)*sizeof(PyObject*)); 1025 | 1026 | for (size_t i = 0; i <= UCHAR_MAX; i++) 1027 | _cache->items[i] = PyBytes_FromFormat("%c", (int)i); 1028 | 1029 | LazyCSV_File* _commas = malloc(sizeof(LazyCSV_File)); 1030 | _commas->name = comma_index; 1031 | _commas->data = comma_memmap; 1032 | _commas->st = comma_st; 1033 | _commas->fd = comma_file; 1034 | 1035 | LazyCSV_File* _anchors = malloc(sizeof(LazyCSV_File)); 1036 | _anchors->name = anchor_index; 1037 | _anchors->data = anchor_memmap; 1038 | _anchors->st = anchor_st; 1039 | _anchors->fd = anchor_file; 1040 | 1041 | LazyCSV_File* _newlines = malloc(sizeof(LazyCSV_File)); 1042 | _newlines->name = newline_index; 1043 | _newlines->data = newline_memmap; 1044 | _newlines->st = newline_st; 1045 | _newlines->fd = newline_file; 1046 | 1047 | LazyCSV_Index* _index = malloc(sizeof(LazyCSV_Index)); 1048 | 1049 | _index->dir = tempdir; 1050 | _index->commas = _commas; 1051 | _index->newlines = _newlines; 1052 | _index->anchors = _anchors; 1053 | 1054 | LazyCSV_File* _data = malloc(sizeof(LazyCSV_File)); 1055 | _data->name = fullname; 1056 | _data->fd = ufd; 1057 | _data->data = file; 1058 | _data->st = ust; 1059 | 1060 | self->rows = rows; 1061 | self->cols = cols; 1062 | self->name = fullname_obj; 1063 | self->headers = headers; 1064 | self->_skip_headers = skip_headers; 1065 | self->_unquote = unquote; 1066 | self->_quotechar = *quotechar; 1067 | self->_newline = newline; 1068 | self->_index = _index; 1069 | self->_data = _data; 1070 | self->_cache = _cache; 1071 | 1072 | return (PyObject*)self; 1073 | 1074 | unmap_memmaps: 1075 | munmap(comma_memmap, comma_st.st_size); 1076 | munmap(anchor_memmap, anchor_st.st_size); 1077 | munmap(newline_memmap, newline_st.st_size); 1078 | Py_DECREF(headers); 1079 | 1080 | close_newline_file: 1081 | close(newline_file); 1082 | 1083 | close_anchor_file: 1084 | close(anchor_file); 1085 | 1086 | close_comma_file: 1087 | close(comma_file); 1088 | munmap(file, ust.st_size); 1089 | Py_XDECREF(tempdir); 1090 | 1091 | close_ufd: 1092 | close(ufd); 1093 | return NULL; 1094 | } 1095 | 1096 | 1097 | static void LazyCSV_Destruct(LazyCSV* self) { 1098 | munmap(self->_data->data, self->_data->st.st_size); 1099 | munmap(self->_index->commas->data, self->_index->commas->st.st_size); 1100 | munmap(self->_index->anchors->data, self->_index->anchors->st.st_size); 1101 | munmap(self->_index->newlines->data, self->_index->newlines->st.st_size); 1102 | 1103 | close(self->_data->fd); 1104 | close(self->_index->commas->fd); 1105 | close(self->_index->anchors->fd); 1106 | close(self->_index->newlines->fd); 1107 | 1108 | remove(self->_index->commas->name); 1109 | remove(self->_index->anchors->name); 1110 | remove(self->_index->newlines->name); 1111 | 1112 | free(self->_index->commas->name); 1113 | free(self->_index->anchors->name); 1114 | free(self->_index->newlines->name); 1115 | 1116 | free(self->_index->commas); 1117 | free(self->_index->anchors); 1118 | 
free(self->_index->newlines); 1119 | 1120 | Py_XDECREF(self->_index->dir); 1121 | 1122 | Py_DECREF(self->_cache->empty); 1123 | for (size_t i = 0; i <= UCHAR_MAX; i++) 1124 | Py_DECREF(self->_cache->items[i]); 1125 | free(self->_cache->items); 1126 | 1127 | free(self->_data); 1128 | free(self->_index); 1129 | free(self->_cache); 1130 | 1131 | Py_DECREF(self->headers); 1132 | Py_DECREF(self->name); 1133 | 1134 | Py_TYPE(self)->tp_free((PyObject*)self); 1135 | } 1136 | 1137 | 1138 | static PyObject *LazyCSV_Seq(PyObject *self, PyObject *args, PyObject *kwargs) { 1139 | 1140 | size_t row = SIZE_MAX; 1141 | size_t col = SIZE_MAX; 1142 | size_t stop; 1143 | char reversed; 1144 | 1145 | static char *kwlist[] = {"row", "col", "reversed", NULL}; 1146 | 1147 | char ok = PyArg_ParseTupleAndKeywords( 1148 | args, kwargs, "|nnb", kwlist, &row, &col, &reversed 1149 | ); 1150 | 1151 | if (!ok) { 1152 | PyErr_SetString( 1153 | PyExc_ValueError, 1154 | "unable to parse lazy.sequence() arguments" 1155 | ); 1156 | return NULL; 1157 | } 1158 | 1159 | if (row == SIZE_MAX && col == SIZE_MAX) { 1160 | PyErr_SetString( 1161 | PyExc_ValueError, 1162 | "a row or a col value is required" 1163 | ); 1164 | return NULL; 1165 | } 1166 | 1167 | if (row != SIZE_MAX && col != SIZE_MAX) { 1168 | PyErr_SetString( 1169 | PyExc_ValueError, 1170 | "cannot specify both row and col" 1171 | ); 1172 | return NULL; 1173 | } 1174 | 1175 | if (col != SIZE_MAX) { 1176 | stop = ((LazyCSV*)self)->rows; 1177 | } 1178 | else if (row != SIZE_MAX) { 1179 | stop = ((LazyCSV*)self)->cols; 1180 | } 1181 | else { 1182 | PyErr_SetString( 1183 | PyExc_RuntimeError, 1184 | "could not determine axis for materialization" 1185 | ); 1186 | return NULL; 1187 | } 1188 | 1189 | PyTypeObject* type = &LazyCSV_IterType; 1190 | LazyCSV_Iter* iter = (LazyCSV_Iter*)type->tp_alloc(type, 0); 1191 | 1192 | if (!iter) { 1193 | PyErr_SetString( 1194 | PyExc_MemoryError, 1195 | "unable to allocate memory for iterable" 1196 | ); 1197 | return NULL; 1198 | } 1199 | 1200 | iter->row = row; 1201 | iter->col = col; 1202 | iter->reversed = reversed; 1203 | iter->position = 0; 1204 | iter->step = 1; 1205 | iter->stop = stop; 1206 | iter->lazy = self; 1207 | 1208 | Py_INCREF(self); 1209 | 1210 | return (PyObject*)iter; 1211 | } 1212 | 1213 | 1214 | static PyObject* LazyCSV_GetValue(PyObject* self, PyObject* r, PyObject* c) { 1215 | 1216 | Py_ssize_t _row = PyLong_AsSsize_t(r); 1217 | Py_ssize_t _col = PyLong_AsSsize_t(c); 1218 | 1219 | LazyCSV* lazy = (LazyCSV*)self; 1220 | 1221 | size_t row = _row < 0 ? lazy->rows + _row : (size_t)_row; 1222 | size_t col = _col < 0 ? 
lazy->cols + _col : (size_t)_col; 1223 | 1224 | int row_in_bounds = ( 1225 | 0 <= row && row < lazy->rows 1226 | ); 1227 | 1228 | int col_in_bounds = ( 1229 | 0 <= col && col < lazy->cols 1230 | ); 1231 | 1232 | if (!row_in_bounds || !col_in_bounds) { 1233 | PyErr_SetString( 1234 | PyExc_ValueError, 1235 | "provided value not in bounds of index" 1236 | ); 1237 | return NULL; 1238 | } 1239 | 1240 | row += !lazy->_skip_headers; 1241 | 1242 | char* newlines = lazy->_index->newlines->data; 1243 | char* anchors = lazy->_index->anchors->data; 1244 | char* commas = lazy->_index->commas->data; 1245 | 1246 | LazyCSV_RowIndex* ridx = 1247 | (LazyCSV_RowIndex*) 1248 | (newlines + row*sizeof(LazyCSV_RowIndex)); 1249 | 1250 | char* aidx = anchors+ridx->index; 1251 | char* cidx = commas+((lazy->cols+1)*row*sizeof(INDEX_DTYPE)); 1252 | 1253 | size_t cs = LazyCSV_ValueFromIndex((size_t)col, ridx, cidx, aidx); 1254 | size_t ce = LazyCSV_ValueFromIndex((size_t)col + 1, ridx, cidx, aidx); 1255 | 1256 | size_t len = ce - cs - 1; 1257 | 1258 | return PyBytes_FromOffsetAndLen(lazy, cs, len); 1259 | } 1260 | 1261 | 1262 | static PyObject* LazyCSV_GetItem(PyObject* self, PyObject* key) { 1263 | if (!PyTuple_Check(key)) { 1264 | PyErr_SetString( 1265 | PyExc_ValueError, 1266 | "index must contain both a row and column value" 1267 | ); 1268 | return NULL; 1269 | } 1270 | 1271 | PyObject *row_obj, *col_obj; 1272 | 1273 | if (!PyArg_ParseTuple(key, "OO", &row_obj, &col_obj)) { 1274 | PyErr_SetString( 1275 | PyExc_RuntimeError, 1276 | "unable to parse index key" 1277 | ); 1278 | return NULL; 1279 | } 1280 | 1281 | if (PyLong_Check(row_obj) && PyLong_Check(col_obj)) 1282 | return LazyCSV_GetValue(self, row_obj, col_obj); 1283 | 1284 | int row_is_slice = PySlice_Check(row_obj); 1285 | int col_is_slice = PySlice_Check(col_obj); 1286 | 1287 | LazyCSV* lazy = (LazyCSV*)self; 1288 | 1289 | if (row_is_slice && !col_is_slice) { 1290 | PySliceObject* row_slice = (PySliceObject*)row_obj; 1291 | 1292 | Py_ssize_t _col = PyLong_AsSsize_t(col_obj); 1293 | size_t col = _col < 0 ? lazy->cols + _col : (size_t)_col; 1294 | 1295 | int col_in_bounds = ( 1296 | 0 <= col && col < lazy->cols 1297 | ); 1298 | 1299 | if (!col_in_bounds) goto boundary_err; 1300 | 1301 | Py_ssize_t _start = row_slice->start == Py_None 1302 | ? (Py_ssize_t)0 1303 | : PyLong_AsSsize_t(row_slice->start); 1304 | Py_ssize_t _stop = row_slice->stop == Py_None 1305 | ? (Py_ssize_t)lazy->rows 1306 | : PyLong_AsSsize_t(row_slice->stop); 1307 | Py_ssize_t _step = 1308 | row_slice->step == Py_None ? 1 : PyLong_AsSsize_t(row_slice->step); 1309 | 1310 | size_t start = _start < 0 ? lazy->rows + _start : (size_t)_start; 1311 | size_t stop = _stop < 0 ? 
lazy->rows + _stop : (size_t)_stop; 1312 | 1313 | size_t step; 1314 | char reversed = 0; 1315 | 1316 | if (_step < 0) { 1317 | reversed = 1; 1318 | step = (size_t)(-1 * _step); 1319 | if (row_slice->start != Py_None) { 1320 | start = lazy->rows - start - 1; 1321 | } 1322 | if (row_slice->stop != Py_None) { 1323 | stop = lazy->rows - stop - 1; 1324 | } 1325 | } 1326 | else { 1327 | step = (size_t)_step; 1328 | } 1329 | 1330 | PyTypeObject* type = &LazyCSV_IterType; 1331 | LazyCSV_Iter* iter = (LazyCSV_Iter*)type->tp_alloc(type, 0); 1332 | if (!iter) goto memory_err; 1333 | 1334 | iter->row = SIZE_MAX; 1335 | iter->col = col; 1336 | iter->reversed = reversed; 1337 | iter->position = start; 1338 | iter->step = step; 1339 | iter->stop = stop; 1340 | iter->lazy = self; 1341 | Py_INCREF(self); 1342 | 1343 | return (PyObject*)iter; 1344 | } 1345 | 1346 | if (col_is_slice && !row_is_slice) { 1347 | PySliceObject* col_slice = (PySliceObject*)col_obj; 1348 | 1349 | Py_ssize_t _row = PyLong_AsSsize_t(row_obj); 1350 | size_t row = _row < 0 ? lazy->rows + _row : (size_t)_row; 1351 | 1352 | int row_in_bounds = ( 1353 | 0 <= row && row < lazy->rows 1354 | ); 1355 | 1356 | if (!row_in_bounds) goto boundary_err; 1357 | 1358 | Py_ssize_t _start = col_slice->start == Py_None 1359 | ? (Py_ssize_t)0 1360 | : PyLong_AsSsize_t(col_slice->start); 1361 | Py_ssize_t _stop = col_slice->stop == Py_None 1362 | ? (Py_ssize_t)lazy->cols 1363 | : PyLong_AsSsize_t(col_slice->stop); 1364 | Py_ssize_t _step = 1365 | col_slice->step == Py_None ? 1 : PyLong_AsSsize_t(col_slice->step); 1366 | 1367 | size_t start = _start < 0 ? lazy->cols + _start : (size_t)_start; 1368 | size_t stop = _stop < 0 ? lazy->cols + _stop : (size_t)_stop; 1369 | 1370 | size_t step; 1371 | char reversed = 0; 1372 | 1373 | if (_step < 0) { 1374 | step = (size_t)(-1 * _step); 1375 | reversed = 1; 1376 | if (col_slice->start != Py_None) { 1377 | start = lazy->cols - start - 1; 1378 | } 1379 | if (col_slice->stop != Py_None) { 1380 | stop = lazy->cols - stop - 1; 1381 | } 1382 | } 1383 | else { 1384 | step = (size_t)_step; 1385 | } 1386 | 1387 | PyTypeObject* type = &LazyCSV_IterType; 1388 | LazyCSV_Iter* iter = (LazyCSV_Iter*)type->tp_alloc(type, 0); 1389 | if (!iter) goto memory_err; 1390 | 1391 | iter->row = row; 1392 | iter->col = SIZE_MAX; 1393 | iter->reversed = reversed; 1394 | iter->position = start; 1395 | iter->step = step; 1396 | iter->stop = stop; 1397 | iter->lazy = self; 1398 | Py_INCREF(self); 1399 | 1400 | return (PyObject*)iter; 1401 | } 1402 | 1403 | goto schema_err; 1404 | 1405 | schema_err: 1406 | PyErr_SetString( 1407 | PyExc_ValueError, 1408 | "given indexing schema is not supported" 1409 | ); 1410 | return NULL; 1411 | 1412 | memory_err: 1413 | PyErr_SetString( 1414 | PyExc_MemoryError, 1415 | "unable to allocate memory for iterable" 1416 | ); 1417 | return NULL; 1418 | 1419 | boundary_err: 1420 | PyErr_SetString( 1421 | PyExc_ValueError, 1422 | "provided value not in bounds of index" 1423 | ); 1424 | return NULL; 1425 | } 1426 | 1427 | 1428 | static PyMemberDef LazyCSV_Members[] = { 1429 | {"headers", T_OBJECT, offsetof(LazyCSV, headers), READONLY, "header tuple"}, 1430 | {"rows", T_LONG, offsetof(LazyCSV, rows), READONLY, "row length"}, 1431 | {"cols", T_LONG, offsetof(LazyCSV, cols), READONLY, "col length"}, 1432 | {"name", T_OBJECT, offsetof(LazyCSV, name), READONLY, "file name"}, 1433 | {NULL, } 1434 | }; 1435 | 1436 | 1437 | static PyMethodDef LazyCSV_Methods[] = { 1438 | { 1439 | "sequence", 1440 | (PyCFunction)LazyCSV_Seq, 
1441 | METH_VARARGS|METH_KEYWORDS, 1442 | "get column iterator" 1443 | }, 1444 | {NULL, } 1445 | }; 1446 | 1447 | static PyMappingMethods LazyCSV_MappingMembers[] = { 1448 | (lenfunc)NULL, 1449 | (binaryfunc)LazyCSV_GetItem, 1450 | (objobjargproc)NULL, 1451 | }; 1452 | 1453 | PyDoc_STRVAR( 1454 | LazyCSV_Docstring, 1455 | "lazycsv.LazyCSV(\n" 1456 | " filepath,\n" 1457 | " /\n" 1458 | " delimiter: str=',',\n" 1459 | " quotechar: str='\"',\n" 1460 | " unquote: bool=True,\n" 1461 | " skip_headers: bool=False,\n" 1462 | " buffer_size: int=2**21,\n" 1463 | " index_dir: str=None,\n" 1464 | ")\n" 1465 | "\n" 1466 | "LazyCSV object constructor. Takes the filepath of a CSV\n" 1467 | "file as the first argument, and several keyword arguments\n" 1468 | "as optional values. Indexes the CSV, generates headers,\n" 1469 | "and returns `self` to the caller." 1470 | "\n\n" 1471 | "Options\n" 1472 | "-------\n" 1473 | "delimiter: ',' -- character used to demarcate the separation\n" 1474 | " of two fields. Should only be a single char.\n" 1475 | "quotechar: '\"' -- character used to ensure contents belong\n" 1476 | " to a single field. Should only be a single char.\n" 1477 | "unquote: bool=True -- if True, a quoted field will be\n" 1478 | " stripped of quotes on parsing. i.e. `,\"goo\\nbar\",`\n" 1479 | " will return 'goo\\nbar'.\n" 1480 | "skip_headers: bool=False -- skips parsing out header\n" 1481 | " values to the .header attribute if True.\n" 1482 | "buffer_size: int=2**21 -- is the buffer size that LazyCSV\n" 1483 | " uses when writing index data to disk during object\n" 1484 | " construction, can be set to any value greater than 0\n" 1485 | " (units of bytes).\n" 1486 | "index_dir: str=None -- Directory where index files\n" 1487 | " are saved. By default uses Python's `TemporaryDirectory()`\n" 1488 | " function in the `tempfile` module.\n" 1489 | "\n" 1490 | "Returns\n" 1491 | "-------\n" 1492 | "self\n" 1493 | 1494 | ); 1495 | 1496 | 1497 | static PyTypeObject LazyCSVType = { 1498 | PyVarObject_HEAD_INIT(NULL, 0) 1499 | .tp_name = "lazycsv.LazyCSV", 1500 | .tp_doc = LazyCSV_Docstring, 1501 | .tp_basicsize = sizeof(LazyCSV), 1502 | .tp_dealloc = (destructor)LazyCSV_Destruct, 1503 | .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, 1504 | .tp_methods = LazyCSV_Methods, 1505 | .tp_members = LazyCSV_Members, 1506 | .tp_as_mapping = LazyCSV_MappingMembers, 1507 | .tp_new = LazyCSV_New, 1508 | }; 1509 | 1510 | 1511 | static PyModuleDef LazyCSVModule = { 1512 | PyModuleDef_HEAD_INIT, 1513 | "lazycsv", 1514 | "module for custom lazycsv object", 1515 | -1, 1516 | NULL 1517 | }; 1518 | 1519 | 1520 | PyMODINIT_FUNC PyInit_lazycsv() { 1521 | #if INCLUDE_NUMPY 1522 | import_array(); 1523 | #endif 1524 | if (PyType_Ready(&LazyCSVType) < 0) 1525 | return NULL; 1526 | 1527 | if (PyType_Ready(&LazyCSV_IterType) < 0) 1528 | return NULL; 1529 | 1530 | PyObject* module = PyModule_Create(&LazyCSVModule); 1531 | if (!module) 1532 | return NULL; 1533 | 1534 | Py_INCREF(&LazyCSVType); 1535 | PyModule_AddObject(module, "LazyCSV", (PyObject*)&LazyCSVType); 1536 | return module; 1537 | } 1538 | 1539 | -------------------------------------------------------------------------------- /tests/benchmark_lazy.py: -------------------------------------------------------------------------------- 1 | import random 2 | import itertools 3 | import os.path 4 | import tempfile 5 | 6 | from time import perf_counter 7 | 8 | 9 | def get_size(file_path, unit="bytes"): 10 | file_size = os.path.getsize(file_path) 11 | exponents_map = {"bytes": 
0, "kb": 1, "mb": 2, "gb": 3} 12 | if unit not in exponents_map: 13 | raise ValueError("Must select from ['bytes', 'kb', 'mb', 'gb']") 14 | else: 15 | size = file_size / 1024 ** exponents_map[unit] 16 | return round(size, 3) 17 | 18 | 19 | def run_lazy(fpath): 20 | from lazycsv import lazycsv 21 | print("indexing lazy... ", end="\r") 22 | ti = perf_counter() 23 | lazy = lazycsv.LazyCSV(fpath) 24 | te = perf_counter() 25 | print(f"indexing lazy... time to index: {te-ti}") 26 | for c in range(lazy.cols): 27 | col = list(lazy.sequence(col=c)) 28 | if c % 100 == 0: 29 | print(f"parsing cols... {c}/{lazy.cols}", end="\r") 30 | del col 31 | tf = perf_counter() 32 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 33 | del lazy 34 | print(f"\ntotal time: {tf-ti}") 35 | 36 | 37 | def run_sqlite(fpath): 38 | import sqlite3, csv 39 | tempdir = tempfile.TemporaryDirectory() 40 | print("creating database... ", end="\r") 41 | ti = perf_counter() 42 | conn = sqlite3.connect(os.path.join(tempdir.name, "data.db")) 43 | cur = conn.cursor() 44 | headers, sql, chunks = [], "", [] 45 | with open(fpath, 'r') as f: 46 | reader = csv.reader(f) 47 | for r in reader: 48 | if not headers: 49 | headers = ', '.join(r) 50 | cur.execute(f"CREATE TABLE t ({headers});") 51 | conn.commit() 52 | sql = f"INSERT INTO t ({headers}) VALUES ({', '.join('?'*len(r))});" 53 | headers = r 54 | chunks.append(r) 55 | if len(chunks) > 10000: 56 | cur.executemany(sql, chunks) 57 | conn.commit() 58 | chunks.clear() 59 | if chunks: 60 | cur.executemany(sql, chunks) 61 | conn.commit() 62 | te = perf_counter() 63 | print(f"creating database... time to db: {te-ti}") 64 | for i, c in enumerate(headers): 65 | sql = f"SELECT ({c}) from t;" 66 | # col = tuple(cur.execute(sql)) 67 | col = list(itertools.chain(*cur.execute(sql))) 68 | if i % 100 == 0: 69 | print(f"parsing cols... {i}/{len(headers)}", end="\r") 70 | del col 71 | tf = perf_counter() 72 | print(f"parsing cols... time to parse: {tf-te}") 73 | cur.close() 74 | conn.close() 75 | print(f"\ntotal time: {tf-ti}") 76 | 77 | 78 | def run_datatable(fpath): 79 | import datatable as dt 80 | print("creating datatables frame...", end="\r") 81 | ti = perf_counter() 82 | frame = dt.fread(fpath) 83 | te = perf_counter() 84 | print(f"creating datatables frame... time to object: {te-ti}") 85 | for c in range(frame.ncols): 86 | col = frame[c].to_list() 87 | if c % 100 == 0: 88 | print(f"parsing cols... {c}/{frame.ncols}", end="\r") 89 | del col 90 | tf = perf_counter() 91 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 92 | del frame 93 | print(f"\ntotal time: {tf-ti}") 94 | 95 | 96 | def run_pandas(fpath): 97 | import pandas as pd 98 | print("creating pandas dataframe...", end="\r") 99 | ti = perf_counter() 100 | df = pd.read_csv(fpath) 101 | te = perf_counter() 102 | print(f"creating pandas dataframe... time to object: {te-ti}") 103 | for i, c in enumerate(df.columns): 104 | col = df[c] 105 | if i % 100 == 0: 106 | print(f"parsing col: {c}", end="\r") 107 | del col 108 | te = perf_counter() 109 | del df 110 | print(f"\ntotal time: {te-ti}") 111 | 112 | 113 | def run_pyarrow(fpath): 114 | from pyarrow import csv as pa_csv 115 | print("creating pyarrow table...", end="\r") 116 | ti = perf_counter() 117 | table = pa_csv.read_csv(fpath) 118 | te = perf_counter() 119 | print(f"creating pyarrow table... time to object: {te-ti}") 120 | for c in range(table.num_columns): 121 | col = table[c].to_pylist() 122 | if c % 100 == 0: 123 | print(f"parsing cols... 
{c}/{table.num_columns}", end="\r") 124 | del col 125 | tf = perf_counter() 126 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 127 | del table 128 | print(f"\ntotal time: {tf-ti}") 129 | 130 | 131 | def run_polars_read(fpath): 132 | import polars as pl 133 | print("creating polars df...", end="\r") 134 | ti = perf_counter() 135 | table = pl.read_csv(fpath) 136 | te = perf_counter() 137 | print(f"creating polars df... time to object: {te-ti}") 138 | for c in range(table.shape[1]): 139 | col = table[:, c].to_list() 140 | if c % 100 == 0: 141 | print(f"parsing cols... {c}/{table.shape[1]}", end="\r") 142 | del col 143 | tf = perf_counter() 144 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 145 | del table 146 | print(f"\ntotal time: {tf-ti}") 147 | 148 | 149 | def run_polars_scan(fpath): 150 | import polars as pl 151 | print("creating polars df...", end="\r") 152 | ti = perf_counter() 153 | table = pl.scan_csv(fpath, rechunk=False) 154 | te = perf_counter() 155 | print(f"creating polars df... time to object: {te-ti}") 156 | cols = len(table.columns) 157 | for i, c in enumerate(table.columns): 158 | col = tuple( 159 | table 160 | .select(c) 161 | .collect() 162 | .to_dict(as_series=False) 163 | .values() 164 | ) 165 | if i % 100 == 0: 166 | print(f"parsing cols... {i}/{cols}", end="\r") 167 | del col 168 | tf = perf_counter() 169 | print(f"parsing cols... time to parse: {tf-te}", end="\r") 170 | del table 171 | print(f"\ntotal time: {tf-ti}") 172 | 173 | 174 | def main(): 175 | cols = 5000 176 | rows = 50000 177 | sparsity = 0.50 178 | benchmarks = { 179 | "lazycsv": run_lazy, 180 | # "pandas": run_pandas, 181 | # "pyarrow": run_pyarrow, 182 | # "datatable": run_datatable, 183 | # "polars (read)": run_polars_read, 184 | # "polars (scan)": run_polars_scan, 185 | # "sqlite": run_sqlite 186 | } 187 | filename = f"benchmark_{rows}r_{cols}c_{int(sparsity*100)}%.csv" 188 | HERE = os.path.abspath(os.path.dirname(__file__)) 189 | dir = os.path.join(HERE, f"fixtures/benchmarks") 190 | os.makedirs(dir, exist_ok=True) 191 | filepath = os.path.join(dir, filename) 192 | if os.path.isfile(filepath): 193 | name = filepath 194 | tempf = None 195 | else: 196 | tempf = tempfile.NamedTemporaryFile() 197 | headers = ",".join(f"col_{i}" for i in range(cols)) + "\n" 198 | tempf.write(headers.encode("utf8")) 199 | i = 0 200 | for i in range(rows): 201 | row = ",".join( 202 | f"{i}x{j}" if random.random() > sparsity else "" for j in range(cols) 203 | ) 204 | tempf.write((row + "\n").encode("utf8")) 205 | del row 206 | if i % 100 == 0: 207 | print(f"writing rows: {i}/{rows}", end="\r") 208 | print(f"writing rows: {i}/{rows}") 209 | tempf.flush() 210 | name = tempf.name 211 | if input("copy to benchmarks? 
[Y/n]: ") in {"Y", ""}: 212 | __import__("shutil").copyfile(tempf.name, filepath) 213 | path = os.path.abspath(name) 214 | print(f"filesize: {get_size(name, 'gb')}gb") 215 | print(f"{cols=}") 216 | print(f"{rows=}") 217 | print(f"{sparsity=}") 218 | for name, fn in benchmarks.items(): 219 | print(f"\nbenchmarking {name}:") 220 | fn(path) 221 | if tempf: 222 | tempf.close() 223 | 224 | 225 | if __name__ == "__main__": 226 | main() 227 | 228 | -------------------------------------------------------------------------------- /tests/fixtures/file.csv: -------------------------------------------------------------------------------- 1 | ,ALPHA,BETA 2 | 0,a0,b0 3 | 1,a1,b1 4 | -------------------------------------------------------------------------------- /tests/fixtures/file_crlf.csv: -------------------------------------------------------------------------------- 1 | ,A,B 2 | 0,a0,b0 3 | -------------------------------------------------------------------------------- /tests/fixtures/file_crlf2.csv: -------------------------------------------------------------------------------- 1 | ,"This,that","Fizz,Buzz" 2 | 0,"Goo,Bar","Bizz,Bazz" -------------------------------------------------------------------------------- /tests/fixtures/file_delimiter_and_quotechar.csv: -------------------------------------------------------------------------------- 1 | INDEX ATTR 2 | 0 |A| 3 | 1 |B| 4 | -------------------------------------------------------------------------------- /tests/fixtures/file_empty.csv: -------------------------------------------------------------------------------- 1 | ,ALPHA,BETA 2 | ,, 3 | ,, 4 | -------------------------------------------------------------------------------- /tests/fixtures/file_newline.csv: -------------------------------------------------------------------------------- 1 | ,"This,that 2 | ","Fizz,Buzz " 3 | 0,"Goo,Bar 4 | ","Bizz,Bazz" 5 | -------------------------------------------------------------------------------- /tests/script_lazycsv.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from lazycsv import lazycsv 4 | 5 | file = sys.argv[1] 6 | 7 | lazy = lazycsv.LazyCSV(file) 8 | 9 | data = [ 10 | list(lazy[:, i]) 11 | for i in range(lazy.cols) 12 | ] 13 | 14 | -------------------------------------------------------------------------------- /tests/test_lazycsv.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import csv 3 | import os 4 | import os.path 5 | import tempfile 6 | import textwrap 7 | 8 | from lazycsv import lazycsv 9 | 10 | import numpy as np 11 | import pytest 12 | 13 | HERE = os.path.abspath(os.path.dirname(__file__)) 14 | FPATH = os.path.join(HERE, "fixtures/file.csv") 15 | 16 | INDEX_COLLECTION = [None, *range(-9, 0), *range(1, 10)] 17 | 18 | SLICE_INDEXES = [ 19 | (a, b, c) 20 | for a in INDEX_COLLECTION 21 | for b in INDEX_COLLECTION 22 | for c in INDEX_COLLECTION 23 | ] 24 | 25 | 26 | @pytest.fixture 27 | def lazy(): 28 | lazy = lazycsv.LazyCSV(FPATH) 29 | yield lazy 30 | 31 | 32 | @pytest.fixture 33 | def file_1000r_1000c(): 34 | tempf = tempfile.NamedTemporaryFile() 35 | cols, rows = 1000, 1000 36 | headers = ",".join("col_{i}".format_map(dict(i=i)) for i in range(cols)) + "\n" 37 | tempf.write(headers.encode("utf8")) 38 | for _ in range(rows): 39 | row = ",".join("{j}".format_map(dict(j=j)) for j in range(cols)) + "\n" 40 | tempf.write(row.encode("utf8")) 41 | tempf.flush() 42 | yield tempf 43 | tempf.close() 44 | 45 | 46 | 
@contextlib.contextmanager 47 | def prepped_file(actual): 48 | tempf = tempfile.NamedTemporaryFile() 49 | tempf.write(actual) 50 | tempf.flush() 51 | yield tempf 52 | tempf.close() 53 | 54 | 55 | def test_demo(): 56 | actual = b"INDEX,A,B\n0,,2\n,,5" 57 | with prepped_file(actual) as tempf: 58 | lazy = lazycsv.LazyCSV(tempf.name) 59 | data = tuple(tuple(lazy.sequence(col=c)) for c in range(lazy.cols)) 60 | assert data == ((b"0", b""), (b"", b""), (b"2", b"5")) 61 | 62 | 63 | class TestLazyCSV: 64 | def test_attributes(self): 65 | lazy = lazycsv.LazyCSV(b"../tests/fixtures/file.csv") 66 | assert lazy.name == os.path.abspath(FPATH).encode() 67 | assert lazy.headers == (b"", b"ALPHA", b"BETA") 68 | 69 | def test_bad_file_arg(self): 70 | with pytest.raises(ValueError) as err: 71 | _ = lazycsv.LazyCSV(1) 72 | (_str,) = err.value.args 73 | assert _str == "first argument must be str or bytes" 74 | 75 | def test_more_headers(self): 76 | actual = b"INDEX,,AA,B,CC,D,EE\n0,1,2,3,4,5,6\n" 77 | with prepped_file(actual) as tempf: 78 | lazy = lazycsv.LazyCSV(tempf.name) 79 | assert lazy.headers == (b"INDEX", b"", b"AA", b"B", b"CC", b"D", b"EE") 80 | 81 | def test_headers_empty_index(self): 82 | actual = b",AA,B,CC,D,EE\n0,1,2,3,4,\n" 83 | with prepped_file(actual) as tempf: 84 | lazy = lazycsv.LazyCSV(tempf.name) 85 | assert lazy.headers == (b"", b"AA", b"B", b"CC", b"D", b"EE") 86 | 87 | def test_initial_parse(self, lazy): 88 | assert lazy.rows, lazy.cols == (2, 3) 89 | 90 | def test_initial_parse_skip_headers(self): 91 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 92 | assert lazy.rows, lazy.cols == (3, 3) 93 | assert lazy.headers == () 94 | 95 | def test_get_column(self, lazy): 96 | actual = list(lazy.sequence(col=0)) 97 | assert actual == [b"0", b"1"] 98 | actual = list(lazy.sequence(col=1)) 99 | assert actual == [b"a0", b"a1"] 100 | actual = list(lazy.sequence(col=2)) 101 | assert actual == [b"b0", b"b1"] 102 | 103 | def test_get_column_slice(self, lazy): 104 | actual = list(lazy[:, 1]) 105 | assert actual == [b"a0", b"a1"] 106 | with pytest.raises(ValueError) as err: 107 | _ = list(lazy[:, -5]) 108 | assert err.value.args == ("provided value not in bounds of index",) 109 | 110 | def test_get_col_slice_variety(self, lazy): 111 | actual = b"INDEX\n0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n" 112 | with prepped_file(actual) as tempf: 113 | lazy = lazycsv.LazyCSV(tempf.name) 114 | for indexes in SLICE_INDEXES: 115 | _slice = slice(*indexes) 116 | expected = list(range(10))[_slice] 117 | actual = list(map(int, lazy[_slice, 0])) 118 | assert actual == expected 119 | 120 | def test_get_actual_col(self): 121 | actual = b"INDEX,ATTR\n0,a\n1,b\n2,c\n3,d\n" 122 | with prepped_file(actual) as tempf: 123 | lazy = lazycsv.LazyCSV(tempf.name) 124 | assert list(lazy.sequence(col=0)) == [b"0", b"1", b"2", b"3"] 125 | assert list(lazy.sequence(col=1)) == [b"a", b"b", b"c", b"d"] 126 | assert lazy.headers == (b"INDEX", b"ATTR") 127 | assert lazy.rows, lazy.cols == (4, 2) 128 | 129 | def test_get_actual_col_skip_headers(self): 130 | actual = b"INDEX,ATTR\n0,a\n1,b\n2,c\n3,d\n" 131 | with prepped_file(actual) as tempf: 132 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 133 | assert list(lazy.sequence(col=0)) == [b"INDEX", b"0", b"1", b"2", b"3"] 134 | assert list(lazy.sequence(col=1)) == [b"ATTR", b"a", b"b", b"c", b"d"] 135 | assert lazy.headers == () 136 | assert lazy.rows, lazy.cols == (4, 2) 137 | 138 | def test_headless_actual_col(self): 139 | actual = b"INDEX,ATTR\n0,a\n1,b\n" 140 | with prepped_file(actual) 
as tempf: 141 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 142 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 143 | 144 | assert lazy.rows, lazy.cols == (3, 2) 145 | assert actual == [[b"INDEX", b"0", b"1"], [b"ATTR", b"a", b"b"]] 146 | assert lazy.headers == () 147 | 148 | def test_get_row(self, lazy): 149 | row_0 = list(lazy.sequence(row=0)) 150 | assert row_0 == [b"0", b"a0", b"b0"] 151 | row_1 = list(lazy.sequence(row=1)) 152 | assert row_1 == [b"1", b"a1", b"b1"] 153 | 154 | def test_get_row_getitem(self, lazy): 155 | row_0 = list(lazy[0, :]) 156 | assert row_0 == [b"0", b"a0", b"b0"] 157 | with pytest.raises(ValueError) as err: 158 | _ = list(lazy[-5, :]) 159 | assert err.value.args == ("provided value not in bounds of index",) 160 | 161 | def test_get_row_slice_variety(self): 162 | actual = b"A,B,C,D,E,F,G,H,I,J\n0,1,2,3,4,5,6,7,8,9\n" 163 | with prepped_file(actual) as tempf: 164 | lazy = lazycsv.LazyCSV(tempf.name) 165 | for indexes in SLICE_INDEXES: 166 | _slice = slice(*indexes) 167 | expected = list(range(10))[_slice] 168 | actual = list(map(int, lazy[0, _slice])) 169 | assert actual == expected 170 | 171 | def test_get_row_slice_skipped_headers(self): 172 | actual = b"A,B,C,D,E,F,G,H,I,J\n0,1,2,3,4,5,6,7,8,9\n" 173 | with prepped_file(actual) as tempf: 174 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 175 | for indexes in SLICE_INDEXES: 176 | _slice = slice(*indexes) 177 | expected = list(range(10))[_slice] 178 | actual = list(map(int, lazy[1, _slice])) 179 | assert actual == expected 180 | 181 | def test_empty_csv(self): 182 | lazy = lazycsv.LazyCSV("fixtures/file_empty.csv") 183 | actual = [list(lazy.sequence(col=i)) for i in range(lazy.cols)] 184 | assert actual == [[b"", b""], [b"", b""], [b"", b""]] 185 | 186 | def test_headless_empty_csv(self): 187 | actual = b",\n,\n,\n" 188 | with prepped_file(actual) as tempf: 189 | lazy = lazycsv.LazyCSV(tempf.name) 190 | col1 = list(lazy.sequence(col=1)) 191 | col0 = list(lazy.sequence(col=0)) 192 | 193 | actual = [col0, col1] 194 | assert actual == [[b"", b""], [b"", b""]] 195 | assert lazy.rows, lazy.cols == (2, 3) 196 | 197 | def test_empty_skipped_headers_csv(self): 198 | actual = b",\n,\n,\n" 199 | with prepped_file(actual) as tempf: 200 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 201 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 202 | assert lazy.rows, lazy.cols == (3, 2) 203 | assert actual == [[b"", b"", b""], [b"", b"", b""]] 204 | assert lazy.headers == () 205 | 206 | def test_getitem(self, lazy): 207 | data = b",,\n0x0,0x1,0x2\n1x0,1x1,1x2\n2x0,2x1,2x2\n" 208 | with prepped_file(data) as tempf: 209 | lazy = lazycsv.LazyCSV(tempf.name) 210 | assert lazy[0, 0] == lazy[-3, -3] == b"0x0" 211 | assert lazy[1, 1] == lazy[-2, -2] == b"1x1" 212 | assert lazy[2, 2] == lazy[-1, -1] == b"2x2" 213 | with pytest.raises(ValueError) as err: 214 | lazy[3, 3] 215 | assert ("provided value not in bounds of index",) == err.value.args 216 | 217 | def test_getitem_empty(self, lazy): 218 | data = b",,\n0x0,0x1,0x2\n1x0,,1x2\n2x0,2x1,2x2\n" 219 | with prepped_file(data) as tempf: 220 | lazy = lazycsv.LazyCSV(tempf.name) 221 | assert lazy[1, 1] == b"" 222 | 223 | def test_getitem_skipped_headers(self): 224 | data = b"0x0,0x1,0x2\n1x0,1x1,1x2\n2x0,2x1,2x2\n" 225 | with prepped_file(data) as tempf: 226 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 227 | assert lazy[0, 0] == lazy[-3, -3] == b"0x0" 228 | assert lazy[1, 1] == lazy[-2, -2] == b"1x1" 229 | assert 
lazy[2, 2] == lazy[-1, -1] == b"2x2" 230 | with pytest.raises(ValueError) as err: 231 | lazy[3, 3] 232 | assert ("provided value not in bounds of index",) == err.value.args 233 | 234 | 235 | class TestLazyCSVIter: 236 | def test_to_list(self, lazy): 237 | assert lazy.sequence(col=0).to_list() == [b"0", b"1"] 238 | assert lazy.sequence(row=1).to_list() == [b'1', b'a1', b'b1'] 239 | 240 | def test_to_numpy(self): 241 | actual = b"INDEX,ATTR\n0,a\n10,b\n100,c\n1000,d\n" 242 | with prepped_file(actual) as tempf: 243 | lazy = lazycsv.LazyCSV(tempf.name) 244 | _iter = lazy.sequence(col=0) 245 | if hasattr(_iter, "to_numpy"): 246 | arr = _iter.to_numpy() 247 | assert arr.tolist() == [b"0", b"10", b"100", b"1000"] 248 | assert (lazy 249 | .sequence(row=1) 250 | .to_numpy() 251 | .tolist() 252 | ) == [b'10', b'b'] 253 | else: 254 | raise RuntimeError( 255 | "test suite did not test numpy, recompile while setting" 256 | " LAZYCSV_INCLUDE_NUMPY=1 as an env variable to compile" 257 | " extension with numpy support." 258 | ) 259 | 260 | 261 | class TestLazyCSVOptions: 262 | def test_custom_quotechar_and_delimiter(self): 263 | lazy = lazycsv.LazyCSV("fixtures/file_delimiter_and_quotechar.csv", quotechar="|", delimiter="\t") 264 | actual = list(list(lazy.sequence(row=i)) for i in range(lazy.rows)) 265 | expected = [[b'0', b'A'], [b'1', b'B']] 266 | assert actual == expected 267 | 268 | def test_custom_quotechar_unquote_false(self): 269 | data = "INDEX,ATTR\n0,|A|\n1,|B|\n" 270 | with prepped_file(data.encode()) as tempf: 271 | lazy = lazycsv.LazyCSV(tempf.name, unquote=False, quotechar="|") 272 | actual = list(list(lazy.sequence(row=i)) for i in range(lazy.rows)) 273 | expected = [[b'0', b'|A|'], [b'1', b'|B|']] 274 | assert actual == expected 275 | 276 | def test_get_skipped_header_column(self): 277 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 278 | actual = list(lazy.sequence(col=0)) 279 | assert actual == [b"", b"0", b"1"] 280 | actual = list(lazy.sequence(col=1)) 281 | assert actual == [b"ALPHA", b"a0", b"a1"] 282 | actual = list(lazy.sequence(col=2)) 283 | assert actual == [b"BETA", b"b0", b"b1"] 284 | 285 | def test_get_skip_headers_row(self): 286 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 287 | row_0 = list(lazy.sequence(row=0)) 288 | assert row_0 == [b"", b"ALPHA", b"BETA"] 289 | row_1 = list(lazy.sequence(row=1)) 290 | assert row_1 == [b"0", b"a0", b"b0"] 291 | row_2 = list(lazy.sequence(row=2)) 292 | assert row_2 == [b"1", b"a1", b"b1"] 293 | 294 | def test_skipped_headers_full_row(self): 295 | actual = b"this,that\n,\n,\n" 296 | with prepped_file(actual) as tempf: 297 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 298 | actual = list(list(lazy.sequence(row=i)) for i in range(lazy.rows)) 299 | assert lazy.headers == () 300 | header, *rest = actual 301 | assert header == [b"this", b"that"] 302 | assert rest == [[b"", b""], [b"", b""]] 303 | 304 | def test_get_skip_headers_row_reversed(self): 305 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 306 | row_0 = list(lazy.sequence(row=0, reversed=True)) 307 | assert row_0 == [b"BETA", b"ALPHA", b""] 308 | row_1 = list(lazy.sequence(row=1, reversed=True)) 309 | assert row_1 == [b"b0", b"a0", b"0"] 310 | row_2 = list(lazy.sequence(row=2, reversed=True)) 311 | assert row_2 == [b"b1", b"a1", b"1"] 312 | 313 | def test_get_reversed_column(self, lazy): 314 | actual = list(lazy.sequence(col=0, reversed=True)) 315 | assert actual == [b"1", b"0"] 316 | actual = list(lazy.sequence(col=1, reversed=True)) 317 | assert actual == [b"a1", 
b"a0"] 318 | actual = list(lazy.sequence(col=2, reversed=True)) 319 | assert actual == [b"b1", b"b0"] 320 | 321 | def test_get_skipped_header_column_reversed(self): 322 | lazy = lazycsv.LazyCSV(FPATH, skip_headers=True) 323 | actual = [list(lazy.sequence(col=i, reversed=True)) for i in range(lazy.cols)] 324 | expected = [ 325 | [b"1", b"0", b""], 326 | [b"a1", b"a0", b"ALPHA"], 327 | [b"b1", b"b0", b"BETA"], 328 | ] 329 | assert actual == expected 330 | 331 | def test_get_reversed_row(self, lazy): 332 | row_0 = list(lazy.sequence(row=0, reversed=True)) 333 | assert row_0 == [b"b0", b"a0", b"0"] 334 | row_1 = list(lazy.sequence(row=1, reversed=True)) 335 | assert row_1 == [b"b1", b"a1", b"1"] 336 | 337 | def test_newlines_in_quote(self): 338 | lazy = lazycsv.LazyCSV("fixtures/file_newline.csv", unquote=False) 339 | assert lazy.headers == (b"", b'"This,that\n"', b'"Fizz,Buzz\r"') 340 | actual = [list(lazy.sequence(col=i)) for i in range(lazy.cols)] 341 | assert actual == [[b"0"], [b'"Goo,Bar\n"'], [b'"Bizz,Bazz"']] 342 | 343 | def test_buffer_size(self): 344 | lazy = lazycsv.LazyCSV(FPATH, buffer_size=1024) 345 | actual = list(lazy.sequence(col=0)) 346 | assert actual == [b"0", b"1"] 347 | 348 | def test_negative_buffer_size(self): 349 | with pytest.raises(ValueError) as e: 350 | lazycsv.LazyCSV(FPATH, buffer_size=-1) 351 | assert e.type == ValueError 352 | 353 | def test_dirname(self): 354 | tempdir = tempfile.TemporaryDirectory() 355 | _ = lazycsv.LazyCSV(FPATH, index_dir=tempdir.name) 356 | assert len(os.listdir(tempdir.name)) == 3 357 | 358 | 359 | class TestCRLF: 360 | def test_crlf1(self): 361 | lazy = lazycsv.LazyCSV("fixtures/file_crlf.csv") 362 | 363 | assert lazy.headers == (b"", b"A", b"B") 364 | actual = list(lazy.sequence(col=0)) 365 | assert actual == [b"0"] 366 | actual = list(lazy.sequence(col=1)) 367 | assert actual == [b"a0"] 368 | actual = list(lazy.sequence(col=2)) 369 | assert actual == [b"b0"] 370 | 371 | def test_crlf2(self): 372 | lazy = lazycsv.LazyCSV("fixtures/file_crlf2.csv", unquote=False) 373 | 374 | assert lazy.headers == (b"", b'"This,that"', b'"Fizz,Buzz"') 375 | actual = list(lazy.sequence(col=0)) 376 | assert actual == [b"0"] 377 | actual = list(lazy.sequence(col=1)) 378 | assert actual == [b'"Goo,Bar"'] 379 | actual = list(lazy.sequence(col=2)) 380 | assert actual == [b'"Bizz,Bazz"'] 381 | 382 | 383 | class TestBigFiles: 384 | def test_bigger_file(self, file_1000r_1000c): 385 | lazy = lazycsv.LazyCSV(file_1000r_1000c.name) 386 | actual = list(lazy.sequence(col=0)) 387 | assert len(actual) == 1000 388 | 389 | def test_variable_buffer_size(self, file_1000r_1000c): 390 | lazy = lazycsv.LazyCSV(file_1000r_1000c.name, buffer_size=10**7) 391 | actual = list(lazy.sequence(col=0)) 392 | assert len(actual) == 1000 393 | 394 | def test_big_sparse(self): 395 | tempf = tempfile.NamedTemporaryFile() 396 | cols, rows = 200, 200 397 | headers = ",".join("col_{i}".format_map(dict(i=i)) for i in range(cols)) + "\n" 398 | tempf.write(headers.encode("utf8")) 399 | targets = {249, 499, 749, 999} 400 | for _ in range(rows): 401 | row = ",".join("{j}".format_map(dict(j=j)) if j in targets else "" for j in range(cols)) + "\n" 402 | tempf.write(row.encode("utf8")) 403 | tempf.flush() 404 | 405 | lazy = lazycsv.LazyCSV(tempf.name) 406 | with open(tempf.name) as f: 407 | reader = csv.reader(f) 408 | headers = tuple(x.encode() for x in next(reader)) 409 | data = list(reader) 410 | assert headers == lazy.headers 411 | for val in range(cols): 412 | expected = [i[val].encode() for i in 
data] 413 | actual = list(lazy.sequence(col=val)) 414 | assert actual == expected 415 | 416 | 417 | class TestUnorderedFiles: 418 | def test_missing_col(self): 419 | data = "x,y,z\r\n1,2\r\n3,1,3\r\n".encode() 420 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 421 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 422 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 423 | expected = [[b"x", b"1", b"3"], [b"y", b"2", b"1"], [b"z", b"", b"3"]] 424 | assert actual == expected 425 | 426 | def test_many_missing_cols(self): 427 | data = "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z\n\n0,,,,,,,,,,,,,,,,,,,,,,,,,1\n".encode() 428 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 429 | lazy = lazycsv.LazyCSV(tempf.name) 430 | assert list(lazy.sequence(col=0)) == [b"", b"0"] 431 | assert list(lazy.sequence(col=25)) == [b"", b"1"] 432 | 433 | def test_extra_col(self): 434 | data = "x,y\r\n1,2,3\r\n4,5\r\n".encode() 435 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 436 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 437 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 438 | expected = [[b"x", b"1", b"4"], [b"y", b"2", b"5"]] 439 | assert actual == expected 440 | 441 | def test_many_extra_col(self): 442 | data = "x\r\n,,,,,,,,,,,,,,,,,,,,,,,,,,,\r\n4\r\n".encode() 443 | with prepped_file(data) as tempf, pytest.warns(RuntimeWarning): 444 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 445 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 446 | expected = [[b"x", b"", b"4"]] 447 | assert actual == expected 448 | 449 | 450 | class TestEdgecases: 451 | def test_many_files_separators(self): 452 | for sep in ("\n", "\r", "\r\n"): 453 | for i in range(250, 265): 454 | header = "A" * i 455 | data = "{header}{sep}1{sep}2".format_map(dict(header=header, sep=sep)) 456 | with prepped_file(data.encode()) as tempf: 457 | lazy = lazycsv.LazyCSV(tempf.name) 458 | actual = list(lazy.sequence(col=0)) 459 | headers = lazy.headers 460 | assert actual == [b"1", b"2"] 461 | assert headers == (header.encode(),) 462 | 463 | def test_many_empty_files_separators(self): 464 | for sep in ("\n", "\r", "\r\n"): 465 | for i in range(250, 261): 466 | header = "A" * i 467 | data = "{header}{sep}{sep}".format_map(dict(header=header, sep=sep)) 468 | with prepped_file(data.encode()) as tempf: 469 | lazy = lazycsv.LazyCSV(tempf.name) 470 | actual = list(lazy.sequence(col=0)) 471 | headers = lazy.headers 472 | assert actual == [b""] 473 | assert headers == (header.encode(),) 474 | 475 | def test_many_empty_files_separators_many_cols(self): 476 | for sep in ("\n", "\r", "\r\n"): 477 | for item in ("", "0"): 478 | for n in range(250, 261): 479 | header = ",".join(item for _ in range(n)) 480 | data = "" 481 | for _ in range(n): 482 | data += header + sep 483 | with prepped_file(data.encode()) as tempf: 484 | lazy = lazycsv.LazyCSV(tempf.name) 485 | actual = list(lazy.sequence(col=0)) 486 | headers = lazy.headers 487 | assert len(actual) == len(headers) - 1 == n - 1 488 | assert all(i == item.encode() for i in actual) 489 | 490 | def test_sparse_column(self): 491 | data = "HEADER\n\n1\n\n2\n\n\n3\n" 492 | with prepped_file(data.encode()) as tempf: 493 | lazy = lazycsv.LazyCSV(tempf.name) 494 | actual = list(lazy.sequence(col=0)) 495 | headers = lazy.headers 496 | assert headers == (b"HEADER",) 497 | assert actual == [b"", b"1", b"", b"2", b"", b"", b"3"] 498 | 499 | def test_sparse_crlf_column(self): 
500 | data = "HEADER\r\n\r\n1\r\n\r\n2\r\n\r\n\r\n3\r\n" 501 | with prepped_file(data.encode()) as tempf: 502 | lazy = lazycsv.LazyCSV(tempf.name) 503 | actual = list(lazy.sequence(col=0)) 504 | headers = lazy.headers 505 | assert headers == (b"HEADER",) 506 | assert actual == [b"", b"1", b"", b"2", b"", b"", b"3"] 507 | 508 | def test_getitem_with_crlf_newline_at_eof(self): 509 | data = 'x,y,z,str,date,quarter,ca_subvar_1,ca_subvar_2,ca_subvar_3,bool1,bool2,bool3\r\n1,2000-01-01T00:00:00,"range(-999.0,0.0)",red,2014-11-01T00:00:00,2014-10-01T00:00:00,1,1,2,1,1,0\r\n2,2000-01-02T00:00:00,No Data,green,2014-11-01T00:00:00,2014-10-01T00:00:00,1,2,3,1,0,0\r\n3,1950-12-24T00:00:00,1.234,reg-green-blue-whatever,2014-12-15T00:00:00,2014-10-01T00:00:00,2,3,4,0,1,0\r\n'.encode() 510 | with prepped_file(data) as tempf: 511 | lazy = lazycsv.LazyCSV(tempf.name) 512 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 513 | expected = [ 514 | [b"1", b"2", b"3"], 515 | [b"2000-01-01T00:00:00", b"2000-01-02T00:00:00", b"1950-12-24T00:00:00"], 516 | [b"range(-999.0,0.0)", b"No Data", b"1.234"], 517 | [b"red", b"green", b"reg-green-blue-whatever"], 518 | [b"2014-11-01T00:00:00", b"2014-11-01T00:00:00", b"2014-12-15T00:00:00"], 519 | [b"2014-10-01T00:00:00", b"2014-10-01T00:00:00", b"2014-10-01T00:00:00"], 520 | [b"1", b"1", b"2"], 521 | [b"1", b"2", b"3"], 522 | [b"2", b"3", b"4"], 523 | [b"1", b"1", b"0"], 524 | [b"1", b"0", b"1"], 525 | [b"0", b"0", b"0"], 526 | ] 527 | assert expected == actual 528 | 529 | def test_problematic_numeric(self): 530 | data = """\ 531 | ColName 532 | -9 533 | 0 534 | -179769313486000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 535 | 1.234 536 | 999 537 | 3.14159 538 | -179769313486000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 539 | """ 540 | data = textwrap.dedent(data) 541 | data = data.encode() 542 | with prepped_file(data) as tempf: 543 | lazy = lazycsv.LazyCSV(tempf.name, skip_headers=True) 544 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 545 | expected = [data.split()] 546 | assert actual == expected 547 | 548 | def test_encoded_headers(self): 549 | data = '"Göteborg","Sverige",Umeå,Köln,東京,deltaΔdelta\nc1,c2,c3,c4,c5,c6\n' 550 | data = data.encode() 551 | with prepped_file(data) as tempf: 552 | lazy = lazycsv.LazyCSV(tempf.name) 553 | assert lazy.headers == ( 554 | "Göteborg".encode(), 555 | "Sverige".encode(), 556 | "Umeå".encode(), 557 | "Köln".encode(), 558 | "東京".encode(), 559 | "delta\u0394delta".encode(), 560 | ) 561 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 562 | expected = [[b"c1"], [b"c2"], [b"c3"], [b"c4"], [b"c5"], [b"c6"]] 563 | assert actual == expected 564 | 565 | def test_crlf_no_newline(self): 566 | actual = "header1,header2,header3\r\n1,2,3\r\n4,5,6\r\n7,8,9".encode() 567 | with prepped_file(actual) as tempf: 568 | lazy = lazycsv.LazyCSV(tempf.name) 569 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 570 | expected = [[b"1", b"4", b"7"], [b"2", b"5", b"8"], [b"3", 
b"6", b"9"]] 571 | assert actual == expected 572 | 573 | def test_empty_headers(self): 574 | data = ( 575 | '" ","","repeated"\n' 576 | '2557," Bagua "," Amazonas"\n' 577 | '2563," Bongara "," Amazonas"\n' 578 | '2535," Chachapoyas "," Amazonas"\n' 579 | '2576," Condorcanqui "," Amazonas"\n' 580 | ) 581 | data = textwrap.dedent(data) 582 | data = data.encode() 583 | with prepped_file(data) as tempf: 584 | lazy = lazycsv.LazyCSV(tempf.name) 585 | assert lazy.headers == (b" ", b"", b"repeated") 586 | actual = list(list(lazy.sequence(col=i)) for i in range(lazy.cols)) 587 | expected = [ 588 | [b"2557", b"2563", b"2535", b"2576"], 589 | [b" Bagua ", b" Bongara ", b" Chachapoyas ", b" Condorcanqui "], 590 | [b" Amazonas", b" Amazonas", b" Amazonas", b" Amazonas"], 591 | ] 592 | assert actual == expected 593 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py35,py36,py37,py38,py39,py310,py311 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | numpy 8 | setenv = 9 | LAZYCSV_INCLUDE_NUMPY=1 10 | commands = 11 | python -m pytest {posargs} 12 | 13 | --------------------------------------------------------------------------------