├── .gitattributes
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── cpython
    ├── csvmonkey.cpp
    ├── file_stream_cursor.hpp
    └── iterator_stream_cursor.hpp
├── docs
    ├── Makefile
    ├── conf.py
    ├── cpp.rst
    ├── index.rst
    ├── python.rst
    ├── static
    │   └── .empty
    └── templates
    │   ├── github.html
    │   ├── globaltoc.html
    │   ├── layout.html
    │   └── piwik-config.js
├── include
    └── csvmonkey.hpp
├── scripts
    ├── calc.py
    ├── compare.py
    ├── csvcut.py
    ├── dequote.py
    └── makesum.py
├── setup.py
├── tests
    ├── .gitignore
    ├── CMakeLists.txt
    ├── _stringspanner_test.cpp
    ├── bench
    │   └── iteration.cpp
    ├── catch.hpp
    ├── csvmonkey_test.py
    ├── data
    │   ├── anon-ram.csv.zstd
    │   └── profiledata.csv
    ├── fallback_stringspanner_test.cpp
    ├── fullsum.cpp
    ├── main.cpp
    ├── parser_test.py
    └── sse42_stringspanner_test.cpp
└── third_party
    ├── cpuid.py
    └── picosha2.h


/.gitattributes:
--------------------------------------------------------------------------------
1 | **.zstd filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ram.*
2 | perf.data*
3 | **.*.sw[op]
4 | tests/bench/iteration
5 | *.pyc
6 | build
7 | **.so
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2017, David Wilson
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are met:
 5 | 
 6 | 1. Redistributions of source code must retain the above copyright notice, this
 7 | list of conditions and the following disclaimer.
 8 | 
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 | 
13 | 3. Neither the name of the copyright holder nor the names of its contributors
14 | may be used to endorse or promote products derived from this software without
15 | specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | graft third_party
2 | graft cpython
3 | graft include
4 | include LICENSE
5 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | CXXFLAGS += -Iinclude
 3 | CXXFLAGS += -Ithird_party
 4 | 
 5 | CXXFLAGS += -std=c++11
 6 | CXXFLAGS += -O3
 7 | CXXFLAGS += -Wall
 8 | CXXFLAGS += -lc
 9 | #CXXFLAGS += -DUSE_SPIRIT
10 | 
11 | default: debug python
12 | 
13 | python:
14 | 	rm -rf build
15 | 	python setup.py build_ext --inplace
16 | 
17 | debug: CXXFLAGS+=-O0 -g
18 | debug: tests/bench/iteration
19 | 
20 | release: X=-DNDEBUG
21 | release: tests/bench/iteration tests/fullsum
22 | 
23 | tests/bench/iteration: tests/bench/iteration.cpp include/csvmonkey.hpp Makefile
24 | 	g++ -std=c++11 $(CXXFLAGS) -msse4.2 $(X) -g -o tests/bench/iteration tests/bench/iteration.cpp
25 | 
26 | tests/fullsum: tests/fullsum.cpp include/csvmonkey.hpp Makefile
27 | 	g++ -std=c++11 $(CXXFLAGS) -msse4.2 $(X) -g -o tests/fullsum tests/fullsum.cpp
28 | 
29 | clean:
30 | 	rm -f tests/fullsum tests/bench/iteration cachegrind* perf.data* *.gcda
31 | 
32 | pgo: X+=-DNDEBUG
33 | pgo:
34 | 	g++ -std=c++11 $(CXXFLAGS) -DNDEBUG -fprofile-generate -msse4.2 $(X) -g -o tests/bench/iteration tests/bench/iteration.cpp
35 | 	./tests/bench/iteration tests/data/profiledata.csv
36 | 	g++ -std=c++11 $(CXXFLAGS) -DNDEBUG -fprofile-use -msse4.2 $(X) -g -o tests/bench/iteration tests/bench/iteration.cpp
37 | 
38 | grind:
39 | 	rm -f cachegrind.out.*
40 | 	valgrind --tool=cachegrind --branch-sim=yes ./tests/bench/iteration ram.64mb.csv
41 | 	cg_annotate --auto=yes cachegrind.out.*
42 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # csvmonkey
  2 | 
  3 | This is a header-only vectorized, lazy-decoding, zero-copy CSV file parser.
  4 | Given appropriate input data and hardware, the C++ version can tokenize ~1.9
  5 | GiB/sec of input in one thread. For a basic summing task, the Python version is
  6 | ~5x faster than `csv.reader` and ~10x faster than `csv.DictReader`, while
  7 | maintaining a similarly usable interface.
  8 | 
  9 | **This still requires a ton of work. For now it's mostly toy code.**
 10 | 
 11 | Requires a CPU supporting Intel SSE4.2 and a C++11 compiler that bundles
 12 | `smmintrin.h`. For non-SSE4.2 machines, a reasonable fallback implementation is
 13 | also provided.
 14 | 
 15 | At the time of writing, csvmonkey comfortably leads <a
 16 | href="https://bitbucket.org/ewanhiggs/csv-game">Ewan Higgs's csv-game</a>
 17 | microbenchmark of 24 CSV parsers.
 18 | 
 19 | 
 20 | ## How It Works
 21 | 
 22 | * **Vectorized**: scanning for values that change parser state is done using
 23 |   the Intel SSE 4.2 PCMPISTRI instruction. PCMPISTRI can locate the first
 24 |   occurrence of up to four values within a 16 byte vector, allowing searching 16 input
 25 |   bytes for end of line, escape, quote, or field separators in one instruction.
 26 | 
 27 | * **Zero Copy**: the user supplies the parser's input buffer. The output is an
 28 |   array of column offsets within a row, each flagged to indicate whether an
 29 |   escape character was detected. The highest throughput is achieved in
 30 |   combination with memory-mapped files, where none of the OS, application or
 31 |   parser make any bulk copies.
 32 | 
 33 | * **Lazy Decoding**: input is not copied or unquoted until requested. Since a
 34 |   flag indicates the presence of escapes, a fast path can avoid any bytewise
 35 |   decode in the usual case where no escape is present. Due to lazy decoding,
 36 |   csvmonkey is extremely effective at tasks that scan only a subset of data,
 37 |   for example one or two columns from a large many-columned CSV. This use case
 38 |   is the original motivation for the design.
 39 | 
 40 | * **Header Only**: the parser has no third-party dependencies, just some
 41 |   templates defined in ``csvmonkey.hpp``.
 42 | 
 43 | 
 44 | ## Python Usage
 45 | 
 46 | You can install the library through pip:
 47 | 
 48 | ```
 49 | pip install csvmonkey
 50 | ```
 51 | 
 52 | If this fails on Ubuntu, first install `clang`:
 53 | 
 54 | ```
 55 | sudo apt-get install clang
 56 | ```
 57 | 
 58 | Then run:
 59 | 
 60 | ```
 61 | CC=clang pip install csvmonkey
 62 | ```
 63 | 
 64 | You can also install the library locally by cloning this repo and running:
 65 | 
 66 | ```
 67 | pip install -e .
 68 | ```
 69 | 
 70 | Then you can use it as follows:
 71 | 
 72 | 1. `import csvmonkey`
 73 | 2. `csvmonkey.from_path()` for a memory-mapped file, `csvmonkey.from_file()`
 74 |    for any file-like object with a `read()` method, or `csvmonkey.from_iter()`
 75 |    for any iterable object that yields lines or file chunks, e.g.
 76 |    `from_iter(open("ram.csv"))`.
 77 | 
 78 | By default a magical `Row` object is yielded during iteration. This object is
 79 | only a window into the currently parsed data, and becomes invalid upon the next
 80 | iteration. Row data is accessed either by index or by key (if `header=True`)
 81 | using:
 82 | 
 83 | ```
 84 | for row in csvmonkey.from_path("ram.csv", header=True):
 85 |     row[20]  # by index
 86 |     row["UnBlendedCost"]  # by header value
 87 | ```
 88 | 
 89 | If your CSV contains a header, specify `header=True` to read it during
 90 | construction. If the CSV lacks a header but dict-like behaviour is desired,
 91 | pass a header explicitly as `header=("a", "b", "c", "d")`.
 92 | 
 93 | Element access causes the relevant chunk of the row to be copied to the heap
 94 | and returned as a Python string.
 95 | 
 96 | Rows may be converted to dicts via `row.asdict()`, tuples via
 97 | `row.astuple()` or lists via `row.aslist()`. If you want rows to be produced
 98 | directly in concrete form, pass `yields="list"`, `yields="tuple"`,
 99 | `yields="dict"` keyword arguments.
100 | 
101 | 
102 | ### Unicode
103 | 
104 | Unicode is supported for character sets where delimiters and line endings are
105 | represented by one byte. To configure Unicode, pass an ``encoding`` parameter,
106 | and optionally an ``errors`` parameter.
107 | 
108 | * "bytes": Return bytes (default on Python 2)
109 | * "utf-8": Decode as UTF-8 (default on Python 3)
110 | * "ascii": Decode as ASCII
111 | * "latin1": Decode as LATIN1
112 | * "locale": Decode according to the active C locale
113 | * "...": Decode according to some codec "..." known to Python
114 | 
115 | Where possible, prefer exact spelling and case matching one of the above
116 | encodings, to ensure an associated fast path is used.
117 | 
118 | 
119 | ## Python Benchmark
120 | 
121 | ram.csv is 614MiB with 1,540,093 records of 22 columns and approximately 418
122 | bytes per record. An anonymized version is checked into LFS as
123 | ``tests/data/anon-ram.csv.zstd``.
124 | 
125 | Python 2.7 Sum: convert to float and sum single column:
126 | 
127 | | Mode                     | Rate       | Ratio | i7-6700HQ | Xeon E5530 | Core i5-2435M |
128 | |--------------------------|------------|-------|-----------|------------|---------------|
129 | | csvmonkey lazy decode    | 1098 MiB/s | -     | 0.559s    | 0.9s       | 1.29s         |
130 | | csvmonkey yields="tuple" | 642 MiB/s  | 1.7x  | 0.956s    | 1.87s      | 2.17s         |
131 | | csvmonkey yields="dict"  | 281 MiB/s  | 3.8x  | 2.18s     | 4.57s      | 5.04s         |
132 | | csv.reader               | 223 MiB/s  | 4.9x  | 2.75s     | 5.88s      | 11.1s         |
133 | | csv.DictReader           | 85 MiB/s   | 12.7x | 7.15s     | 16.3s      | 25.0s         |
134 | 
135 | Python 2.7 No-op: Iterate complete file, no other processing:
136 | 
137 | | Mode                     | Rate       | Ratio | i7-6700HQ | Xeon E5530 |
138 | |--------------------------|------------|-------|-----------|------------|
139 | | csvmonkey lazy decode    | 1906 MiB/s | -     | 0.322s    | 0.444s     |
140 | | csvmonkey yields="tuple" | 831 MiB/s  | 2.3x  | 0.738s    | 1.4s       |
141 | | csvmonkey yields="dict"  | 318 MiB/s  | 6.0x  | 1.93s     | 4.26s      |
142 | | csv.reader               | 248 MiB/s  | 7.6x  | 2.47s     | 5.31s      |
143 | | csv.DictReader           | 92 MiB/s   | 20.5x | 6.62s     | 15.2s      |
144 | 
145 | Python 3.6 No-op: Iterate complete file, includes charset decoding
146 | 
147 | | Mode                                        | Rate       | Ratio      | i7-6700HQ  |
148 | |---------------------------------------------|------------|------------|------------|
149 | | csvmonkey lazy decode                       | 1906 MiB/s | -          | 0.322s     |
150 | | csvmonkey yields="tuple", encoding="bytes"  | 833 MiB/s  | 2.3x       | 0.737s     |
151 | | csvmonkey yields="tuple", encoding="latin1" | 579 MiB/s  | 3.3x       | 1.06s      |
152 | | csvmonkey yields="tuple"                    | 495 MiB/s  | 3.8x       | 1.24s      |
153 | | csvmonkey yields="dict"                     | 235 MiB/s  | 8.1x       | 2.61s      |
154 | | csv.reader                                  | 121 MiB/s  | 15.7x      | 5.07s      |
155 | | csv.DictReader                              | 55 MiB/s   | 34.4x      | 11.1s      |
156 | 
157 | 
158 | ### Command lines
159 | 
160 | Sum:
161 | 
162 | ```
163 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'sum(float(row["UnBlendedCost"]) for row in csvmonkey.from_path("ram.csv", header=True))'
164 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'sum(float(row[20]) for row in csvmonkey.from_path("ram.csv", header=True, yields="tuple"))'
165 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'sum(float(row["UnBlendedCost"]) for row in csvmonkey.from_path("ram.csv", header=True, yields="dict"))'
166 | python -mtimeit -n1 -r3 -s 'import csv' 'r = csv.reader(open("ram.csv")); next(r); sum(float(row[20]) for row in r)'
167 | python -mtimeit -n1 -r3 -s 'import csv' 'sum(float(row["UnBlendedCost"]) for row in csv.DictReader(open("ram.csv")))'
168 | ```
169 | 
170 | No-op:
171 | 
172 | ```
173 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'all(csvmonkey.from_path("ram.csv", header=True))'
174 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'all(csvmonkey.from_path("ram.csv", header=True, yields="tuple"))'
175 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'all(csvmonkey.from_path("ram.csv", header=True, yields="dict"))'
176 | python -mtimeit -n1 -r3 -s 'import csv' 'all(csv.reader(open("ram.csv")))'
177 | python -mtimeit -n1 -r3 -s 'import csv' 'all(csv.DictReader(open("ram.csv")))'
178 | ```
179 | 
180 | 
181 | ## C++ Usage
182 | 
183 | 1. Copy `csvmonkey.hpp` to your project and include it.
184 | 1. `CFLAGS=-msse4.2 -O3`
185 | 1. See `Makefile` for an example of producing a profile-guided build (worth an
186 |    extra few %).
187 | 1. Instantiate `MappedFileCursor` (zero copy) or `FdStreamCursor` (buffered), attach it to a `CsvReader`.
188 | 1. Invoke `read_row()` and use `row().by_value()` to pick out `CsvCell` pointers for your desired columns.
189 | 1. Pump `read_row()` in a loop and use cell's `ptr()`, `size()`, `as_str()`, `equals()` and `as_double()` methods while `read_row()` returns true.
190 | 
191 | 
192 | # TODO
193 | 
194 | * COW pointer interface to `as_str()`.
195 | * ~~Finish Python 3 support~~
196 | * ~~Ensure Python ReaderObject is always 16-byte aligned~~
197 | * Fix handling of last row when it:
198 |     * lacks newline, or
199 |     * is truncated after final quote, or
200 |     * is truncated within a quote, or
201 |     * is truncated within an escape
202 | * Restartable: fix quadratic behaviour when `StreamCursor` yields lines and CSV
203 |   rows span lines
204 | * ~~Python `from_file()` that uses `read()` in preference to `__iter__()`.~~
205 | * ~~Fix CRLF / LFCR handling.~~
206 | * ~~`StreamCursor` error / exception propagation.~~
207 | * ~~Remove hard 256 column limit & fix crash if it's exceeded.~~
208 | * ~~Ensure non-SSE fallback return codes match SSE when not found.~~
209 | * ~~Map single zero page after file pages in MappedFileCursor~~
210 | * ~~Add trailing 16 NUL bytes to BufferedStreamCursor~~
211 | * ~~Remove hard-coded page size~~
212 | * ~~(Single byte separator) Unicode support.~~
213 | * (Multi byte separator) Unicode support.
214 | 


--------------------------------------------------------------------------------
/cpython/csvmonkey.cpp:
--------------------------------------------------------------------------------
   1 | 
   2 | #include <Python.h>
   3 | 
   4 | #include "csvmonkey.hpp"
   5 | #include "iterator_stream_cursor.hpp"
   6 | #include "file_stream_cursor.hpp"
   7 | 
   8 | using namespace csvmonkey;
   9 | 
  10 | extern PyTypeObject CellType;
  11 | extern PyTypeObject ReaderType;
  12 | extern PyTypeObject RowType;
  13 | struct RowObject;
  14 | 
  15 | 
  16 | enum CursorType
  17 | {
  18 |     CURSOR_MAPPED_FILE,  // delete_cursor() deletes the cursor as a MappedFileCursor
  19 |     CURSOR_ITERATOR,  // delete_cursor() deletes the cursor as an IteratorStreamCursor
  20 |     CURSOR_PYTHON_FILE  // delete_cursor() deletes the cursor as a FileStreamCursor
  21 | };
  22 | 
  23 | 
  24 | typedef PyObject *(*to_string_fn)(struct ReaderObject *, CsvCell *);  // turns one cell into a Python object; the cell_to_* functions below have this signature
  25 | 
  26 | struct ReaderObject
  27 | {
  28 |     PyObject_HEAD
  29 |     CursorType cursor_type; // concrete type of `cursor`; consulted by delete_cursor()
  30 |     StreamCursor *cursor; // released in reader_dealloc() via delete_cursor()
  31 |     // CsvReader cannot be inline because pymalloc does not satisfy alignment
  32 |     // requirement
  33 |     CsvReader<> *reader; // heap-allocated; deleted in reader_dealloc()
  34 |     to_string_fn to_string; // cell -> Python object conversion used by the row accessors
  35 |     PyObject *(*yields)(RowObject *); // row_asdict, row_aslist, row_astuple or row_return_self
  36 |     int header; // nonzero when a truthy `header` argument was supplied
  37 |     size_t record; // Current record number
  38 | 
  39 |     CsvCursor *row; // row whose cells are read after reader->read_row()
  40 |     PyObject *py_row; // cleared by reader_clear(), visited by reader_traverse()
  41 | 
  42 |     // Unicode.
  43 |     const char *encoding; // unused unless to_string==cell_to_unicode
  44 |     const char *errors; // "strict", "ignore", "..."
  45 | 
  46 |     // Map header string -> index.
  47 |     PyObject *header_map;
  48 | };
  49 | 
  50 | 
  51 | struct CellObject
  52 | {
  53 |     PyObject_HEAD;
  54 |     ReaderObject *reader; // strong ref.
  55 |     CsvCell *cell; // set by cell_new(); cell_dealloc() only resets it, never frees it
  56 | };
  57 | 
  58 | 
  59 | struct RowObject
  60 | {
  61 |     PyObject_HEAD;
  62 |     ReaderObject *reader; // strong ref, keeps row alive.
  63 |     CsvCursor *row; // copied from reader->row in row_new()
  64 | };
  65 | 
  66 | 
  67 | /*
  68 |  * String factories.
  69 |  */
  70 | 
  71 | #if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
  72 | #   define HAS_LOCALE  // enables cell_to_locale() below
  73 | #endif
  74 | 
  75 | 
  76 | static PyObject *
  77 | cell_to_bytes(ReaderObject *reader, CsvCell *cell)
  78 | {
  79 |     if(cell->escaped) {  // escaped cell: build the bytes from the as_str() copy
  80 |         auto decoded = cell->as_str();
  81 |         return PyBytes_FromStringAndSize(&decoded[0], decoded.size());
  82 |     }
  83 |     return PyBytes_FromStringAndSize(cell->ptr, cell->size);
  84 | }
  85 | 
  86 | 
  87 | static PyObject *
  88 | cell_to_utf8(ReaderObject *reader, CsvCell *cell)
  89 | {
  90 |     if(cell->escaped) {  // escaped cell: decode from the as_str() copy
  91 |         auto decoded = cell->as_str();
  92 |         return PyUnicode_DecodeUTF8(&decoded[0], decoded.size(), reader->errors);
  93 |     }
  94 |     return PyUnicode_DecodeUTF8(cell->ptr, cell->size, reader->errors);
  95 | }
  96 | 
  97 | 
  98 | static PyObject *
  99 | cell_to_ascii(ReaderObject *reader, CsvCell *cell)
 100 | {
 101 |     if(cell->escaped) {  // escaped cell: decode from the as_str() copy
 102 |         auto decoded = cell->as_str();
 103 |         return PyUnicode_DecodeASCII(&decoded[0], decoded.size(), reader->errors);
 104 |     }
 105 |     return PyUnicode_DecodeASCII(cell->ptr, cell->size, reader->errors);
 106 | }
 107 | 
 108 | 
 109 | static PyObject *
 110 | cell_to_latin1(ReaderObject *reader, CsvCell *cell)
 111 | {
 112 |     if(cell->escaped) {  // escaped cell: decode from the as_str() copy
 113 |         auto decoded = cell->as_str();
 114 |         return PyUnicode_DecodeLatin1(&decoded[0], decoded.size(), reader->errors);
 115 |     }
 116 |     return PyUnicode_DecodeLatin1(cell->ptr, cell->size, reader->errors);
 117 | }
 118 | 
 119 | 
 120 | #ifdef HAS_LOCALE
 121 | static PyObject *
 122 | cell_to_locale(ReaderObject *reader, CsvCell *cell)
 123 | {
 124 |     if(cell->escaped) {  // escaped cell: decode from the as_str() copy
 125 |         auto decoded = cell->as_str();
 126 |         return PyUnicode_DecodeLocaleAndSize(&decoded[0], decoded.size(), reader->errors);
 127 |     }
 128 |     return PyUnicode_DecodeLocaleAndSize(cell->ptr, cell->size, reader->errors);
 129 | }
 130 | #endif
 131 | 
 132 | 
 133 | static PyObject *
 134 | cell_to_unicode(ReaderObject *reader, CsvCell *cell)
 135 | {
 136 |     if(cell->escaped) {  // escaped cell: decode from the as_str() copy
 137 |         auto decoded = cell->as_str();
 138 |         return PyUnicode_Decode(&decoded[0], decoded.size(),
 139 |             reader->encoding, reader->errors);
 140 |     }
 141 |     return PyUnicode_Decode(cell->ptr, cell->size, reader->encoding, reader->errors);
 142 | }
 143 | 
 144 | 
 145 | /*
 146 |  * Cell methods.
 147 |  */
 148 | 
 149 | static PyObject *
 150 | cell_as_double(CellObject *self)  // wraps the cell's as_double() result in a new Python float
 151 | {
 152 |     return PyFloat_FromDouble(self->cell->as_double());
 153 | }
 154 | 
 155 | 
 156 | static PyObject *
 157 | cell_as_str(CellObject *self)
 158 | {
 159 |     // Delegate to the string factory configured on the owning reader.
 160 |     return self->reader->to_string(self->reader, self->cell);
 161 | }
 162 | 
 163 | static PyObject *
 164 | cell_equals(CellObject *self, PyObject *args)
 165 | {
 166 |     // Returning NULL without an exception set makes CPython raise
 167 |     // SystemError, so report bad arguments explicitly as TypeError.
 168 |     if(PyTuple_GET_SIZE(args) < 1) {
 169 |         PyErr_SetString(PyExc_TypeError, "equals() requires one argument");
 170 |         return NULL;
 171 |     }
 172 | 
 173 |     PyObject *py_s = PyTuple_GET_ITEM(args, 0);
 174 |     if(! PyBytes_CheckExact(py_s)) {
 175 |         PyErr_SetString(PyExc_TypeError, "equals() argument must be bytes");
 176 |         return NULL;
 177 |     }
 178 | 
 179 |     // Compare the cell against the bytes payload; returns a new reference
 180 |     // to Py_True or Py_False.
 181 |     PyObject *py_true = self->cell->equals(PyBytes_AS_STRING(py_s)) ? Py_True : Py_False;
 182 |     Py_INCREF(py_true);
 183 |     return py_true;
 184 | }
 184 | 
 185 | 
 186 | static int
 187 | cell_compare(PyObject *self_, PyObject *o2)
 188 | {
 189 |     // 0 only when o2 is exactly bytes and matches the cell; -1 otherwise.
 190 |     if(PyBytes_CheckExact(o2)) {
 191 |         CellObject *self = (CellObject *) self_;
 192 |         if(self->cell->equals(PyBytes_AS_STRING(o2))) {
 193 |             return 0;
 194 |         }
 195 |     }
 196 |     return -1;
 197 | }
 197 | 
 198 | 
 199 | static PyObject *
 200 | cell_richcmp(PyObject *self_, PyObject *o2, int op)
 201 | {
 202 |     // Rich comparison against exact bytes objects; only == and != are
 203 |     // supported. Any other operand type or operator yields NotImplemented
 204 |     // so Python can try its fallbacks. The previous NULL return with no
 205 |     // exception set surfaced as SystemError.
 206 |     PyObject *out = Py_NotImplemented;
 207 |     if(PyBytes_CheckExact(o2) && (op == Py_EQ || op == Py_NE)) {
 208 |         CellObject *self = (CellObject *) self_;
 209 |         bool same = self->cell->equals(PyBytes_AS_STRING(o2));
 210 |         out = (same == (op == Py_EQ)) ? Py_True : Py_False;
 211 |     }
 212 | 
 213 |     Py_INCREF(out);
 214 |     return out;
 215 | }
 216 | 
 217 | 
 218 | static PyObject *
 219 | cell_new(ReaderObject *reader, CsvCell *cell)
 220 | {
 221 |     // Allocate a Cell wrapper; on success it takes a strong reference to
 222 |     // the reader and stores the cell pointer as given. NULL on failure.
 223 |     CellObject *wrapper = (CellObject *) PyObject_New(CellObject, &CellType);
 224 |     if(wrapper) {
 225 |         Py_INCREF(reader);
 226 |         wrapper->reader = reader;
 227 |         wrapper->cell = cell;
 228 |     }
 229 |     return (PyObject *) wrapper;
 230 | }
 231 | 
 232 | 
 233 | static void
 234 | cell_dealloc(PyObject *self_)
 235 | {
 236 |     CellObject *self = (CellObject *) self_;
 237 |     self->cell = NULL;
 238 |     Py_CLEAR(self->reader);  // drop the strong ref taken in cell_new()
 239 |     PyObject_Del(self_);  // pairs with PyObject_New(); the object itself used to leak
 240 | }
 241 | 
 242 | 
 243 | /*
 244 |  * Row methods
 245 |  */
 246 | 
 247 | static int
 248 | row_clear(RowObject *self)  // drops the strong reference to the reader
 249 | {
 250 |     Py_CLEAR(self->reader);
 251 |     return 0;
 252 | }
 253 | 
 254 | 
 255 | static void
 256 | row_dealloc(RowObject *self)
 257 | {
 258 |     PyObject_GC_UnTrack((PyObject *) self);  // tracked in row_new(); untrack before teardown
 259 |     self->row = NULL;
 260 |     Py_CLEAR(self->reader);
 261 |     Py_TYPE(self)->tp_free((PyObject *)self);
 262 | }
 263 | 
 264 | static int
 265 | row_traverse(RowObject *self, visitproc visit, void *arg)  // reports the reader reference to `visit`
 266 | {
 267 |     Py_VISIT(self->reader);
 268 |     return 0;
 269 | }
 270 | 
 271 | 
 272 | static PyObject *
 273 | row_new(ReaderObject *reader)
 274 | {
 275 |     // Create a GC-tracked Row over reader->row; returns NULL if allocation fails.
 276 |     RowObject *self = (RowObject *) PyObject_GC_New(RowObject, &RowType);
 277 |     if(self) {
 278 |         Py_INCREF(reader);
 279 |         self->reader = reader;
 280 |         self->row = reader->row;  // moved inside the check: self is NULL on failure
 281 |         PyObject_GC_Track((PyObject *) self);
 282 |     }
 283 |     return (PyObject *) self;
 284 | }
 284 | 
 285 | 
 286 | static PyObject *
 287 | row_aslist(RowObject *self)
 288 | {
 289 |     // New list of every cell in the row, or NULL if any step fails.
 290 |     PyObject *lst = PyList_New(self->row->count);
 291 |     if(! lst) {
 292 |         return NULL;
 293 |     }
 294 |     ReaderObject *r = self->reader;
 295 |     to_string_fn convert = r->to_string;
 296 |     CsvCell *cells = &self->row->cells[0];
 297 |     const int count = self->row->count;
 298 |     for(int i = 0; i < count; i++) {
 299 |         PyObject *item = convert(r, &cells[i]);
 300 |         if(! item) {
 301 |             Py_DECREF(lst);
 302 |             return NULL;
 303 |         }
 304 |         PyList_SET_ITEM(lst, i, item);
 305 |     }
 306 |     return lst;
 307 | }
 308 | 
 309 | 
 310 | static PyObject *
 311 | row_astuple(RowObject *self)
 312 | {
 313 |     // New tuple of every cell in the row, or NULL if any step fails.
 314 |     PyObject *tup = PyTuple_New(self->row->count);
 315 |     if(! tup) {
 316 |         return NULL;
 317 |     }
 318 |     ReaderObject *r = self->reader;
 319 |     to_string_fn convert = r->to_string;
 320 |     CsvCell *cells = &self->row->cells[0];
 321 |     const int count = self->row->count;
 322 |     for(int i = 0; i < count; i++) {
 323 |         PyObject *item = convert(r, &cells[i]);
 324 |         if(! item) {
 325 |             Py_DECREF(tup);
 326 |             return NULL;
 327 |         }
 328 |         PyTuple_SET_ITEM(tup, i, item);
 329 |     }
 330 |     return tup;
 331 | }
 332 | 
 333 | 
 334 | static PyObject *
 335 | row_asdict(RowObject *self)
 336 | {
 337 |     // Build a dict mapping each header key to the matching cell of the
 338 |     // current row. Header entries whose index is not below the row's cell
 339 |     // count are left out. Raises TypeError when the reader has no header map.
 340 |     ReaderObject *r = self->reader;
 341 |     if(! r->header_map) {
 342 |         PyErr_Format(PyExc_TypeError,
 343 |                      "Cannot convert to dict; no header is present");
 344 |         return NULL;
 345 |     }
 346 | 
 347 |     PyObject *out = PyDict_New();
 348 |     if(! out) {
 349 |         return NULL;
 350 |     }
 351 | 
 352 |     CsvCell *cells = &self->row->cells[0];
 353 |     to_string_fn convert = r->to_string;
 354 |     Py_ssize_t pos = 0;
 355 |     PyObject *key;
 356 |     PyObject *value;
 357 | 
 358 |     while(PyDict_Next(r->header_map, &pos, &key, &value)) {
 359 |         int i = PyLong_AsLong(value);
 360 |         if(i >= self->row->count) {
 361 |             continue;
 362 |         }
 363 | 
 364 |         PyObject *s = convert(r, &cells[i]);
 365 |         int failed = (! s) || PyDict_SetItem(out, key, s);
 366 |         Py_XDECREF(s);
 367 |         if(failed) {
 368 |             Py_DECREF(out);
 369 |             return NULL;
 370 |         }
 371 |     }
 372 | 
 373 |     return out;
 374 | }
 375 | 
 376 | 
 377 | static PyObject *
 378 | row_return_self(RowObject *self)  // returns a new reference to the row object itself
 379 | {
 380 |     Py_INCREF(self);
 381 |     return (PyObject *) self;
 382 | }
 383 | 
 384 | 
 385 | static PyObject *
 386 | row_repr(RowObject *self)
 387 | {
 388 |     // Snapshot the row in concrete form: a dict when the reader's header
 389 |     // flag is set, otherwise a tuple.
 390 |     PyObject *obj = (
 391 |         self->reader->header ?
 392 |         row_asdict(self) :
 393 |         row_astuple(self)
 394 |     );
 395 |     if(! obj) {
 396 |         return NULL;
 397 |     }
 398 | 
 399 |     // %R embeds repr(obj) in the result.
 400 |     PyObject *repr = PyUnicode_FromFormat(
 401 |         "<csvmonkey._Row positioned at %R>",
 402 |         obj
 403 |     );
 404 |     Py_DECREF(obj);
 405 |     return repr;
 406 | }
 407 | 
 408 | 
 409 | static Py_ssize_t
 410 | row_length(RowObject *self)  // number of cells in the row
 411 | {
 412 |     return self->row->count;
 413 | }
 414 | 
 415 | 
 416 | static PyObject *
 417 | row_getitem(RowObject *self, Py_ssize_t index)
 418 | {
 419 |     if(index < 0) {  // negative indices count back from the end of the row
 420 |         index = self->row->count + index;
 421 |     }
 422 | 
 423 |     // Valid cells are [0, count); `>` used to let index == count through.
 424 |     if(index < 0 || index >= (Py_ssize_t) self->row->count) {
 425 |         PyErr_Format(PyExc_IndexError,
 426 |                      "index %ld greater than parsed col count %lu",
 427 |                      (long) index,
 428 |                      (unsigned long) self->row->count);
 429 |         return NULL;
 430 |     }
 431 | 
 432 |     return self->reader->to_string(self->reader, &self->row->cells[index]);
 433 | }
 434 | 
 435 | 
 436 | static Py_ssize_t
 437 | row_getlength(RowObject *self)  // number of cells in the row, as Py_ssize_t
 438 | {
 439 |     return (Py_ssize_t) self->row->count;
 440 | }
 441 | 
 442 | 
 443 | static PyObject *
 444 | row_subscript(RowObject *self, PyObject *key)
 445 | {
 446 |     long index;  // integer position, or the position stored for a header key
 447 |     if(PyLong_Check(key)) {
 448 |         index = PyLong_AsLong(key);
 449 |         if(index == -1 && PyErr_Occurred()) {
 450 |             return NULL;  // conversion failed; -1 is not a real index here
 451 |         }
 452 |         if(index < 0) {
 453 |             index += (long) self->row->count;
 454 |         }
 455 | #if PY_MAJOR_VERSION < 3
 456 |     } else if(PyInt_Check(key)) {
 457 |         index = PyInt_AS_LONG(key);
 458 |         if(index < 0) {
 459 |             index += (long) self->row->count;
 460 |         }
 461 | #endif
 462 |     } else if(! self->reader->header_map) {
 463 |         PyErr_Format(PyExc_IndexError, "Reader instantiated with header=False");
 464 |         return NULL;
 465 |     } else {
 466 |         PyObject *py_index = PyDict_GetItem(self->reader->header_map, key);
 467 |         if(! py_index) {
 468 |             PyErr_Format(PyExc_KeyError, "No such key.");
 469 |             return NULL;
 470 |         }
 471 |         index = PyLong_AsLong(py_index);
 472 |     }
 473 | 
 474 |     if(index < 0 || index >= (long) self->row->count) {  // valid cells are [0, count); `>` was off by one
 475 |         PyErr_Format(PyExc_IndexError,
 476 |                      "index %ld greater than parsed col count %lu",
 477 |                      index,
 478 |                      (unsigned long) self->row->count);
 479 |         return NULL;
 480 |     }
 481 |     return self->reader->to_string(self->reader, &self->row->cells[index]);
 482 | }
 483 | 
 484 | 
 485 | static PyObject *
 486 | row_iter(RowObject *self)
 487 | {
 488 |     // Iterate over a tuple snapshot of the current row.
 489 |     PyObject *iter = NULL;
 490 |     PyObject *tup = row_astuple(self);
 491 |     if(tup) {
 492 |         iter = PyObject_GetIter(tup);
 493 |         Py_DECREF(tup);
 494 |     }
 495 |     return iter;
 496 | }
 497 | 
 498 | 
 499 | /*
 500 |  * Reader methods
 501 |  */
 502 | 
 503 | static int
 504 | reader_clear(ReaderObject *self)  // releases the Python objects the reader holds
 505 | {
 506 |     Py_CLEAR(self->py_row);
 507 |     Py_CLEAR(self->header_map);
 508 |     return 0;
 509 | }
 510 | 
 511 | 
 512 | static void
 513 | delete_cursor(CursorType type, StreamCursor *cursor)
 514 | {
 515 |     // Destroy `cursor` through the concrete class recorded in `type`.
 516 |     // NOTE(review): whether StreamCursor declares a virtual destructor is
 517 |     // not visible here; the casts make the delete independent of that.
 518 |     if(type == CURSOR_MAPPED_FILE) {
 519 |         delete (MappedFileCursor *)cursor;
 520 |     } else if(type == CURSOR_ITERATOR) {
 521 |         delete (IteratorStreamCursor *)cursor;
 522 |     } else if(type == CURSOR_PYTHON_FILE) {
 523 |         delete (FileStreamCursor *)cursor;
 524 |     } else {
 525 |         // Not a known CursorType value.
 526 |         assert(0);
 527 |     }
 528 | }
 529 | 
 530 | 
 531 | static void
 532 | reader_dealloc(ReaderObject *self)
 533 | {
 534 |     PyObject_GC_UnTrack((PyObject *) self);  // allocated with PyObject_GC_New; untrack before teardown
 535 |     reader_clear(self);
 536 |     delete self->reader;
 537 |     delete_cursor(self->cursor_type, self->cursor);
 538 |     Py_TYPE(self)->tp_free((PyObject *)self);
 539 | }
 540 | 
 541 | static int
 542 | reader_traverse(ReaderObject *self, visitproc visit, void *arg)
 543 | {
 544 |     Py_VISIT(self->py_row);
 545 |     Py_VISIT(self->header_map);  // cleared by reader_clear(), so it must be reported too
 546 |     return 0;
 547 | }
 548 | 
 549 | static int
 550 | header_from_first_row(ReaderObject *self)
 551 | {
 552 |     if(! self->reader->read_row()) {
 553 |         if(! PyErr_Occurred()) {
 554 |             PyErr_Format(PyExc_IOError, "Could not read header row");
 555 |         }
 556 |         return -1;
 557 |     }
 558 | 
 559 |     self->header_map = PyDict_New();
 560 |     if(! self->header_map) {
 561 |         return -1;
 562 |     }
 563 | 
 564 |     // Runtime checks replace assert(), which is compiled out under NDEBUG.
 565 |     CsvCell *cell = &self->row->cells[0];
 566 |     for(int i = 0; i < self->row->count; i++, cell++) {
 567 |         PyObject *key = self->to_string(self, cell);
 568 |         PyObject *value = key ? PyLong_FromLong(i) : NULL;
 569 |         int rc = value ? PyDict_SetItem(self->header_map, key, value) : -1;
 570 |         Py_XDECREF(key);
 571 |         Py_XDECREF(value);
 572 |         if(rc) {
 573 |             return -1;  // Python exception already set by the failing call
 574 |         }
 575 |     }
 576 |     return 0;
 577 | }
 578 | 
 579 | static int
 580 | header_from_sequence(ReaderObject *self, PyObject *header)
 581 | {
 582 |     // header_map is assigned before any other early return, as before.
 583 |     self->header_map = PyDict_New();
 584 |     if(! self->header_map) {
 585 |         return -1;
 586 |     }
 587 | 
 588 |     Py_ssize_t length = PySequence_Length(header);
 589 |     if(length < 0) {
 590 |         return -1;  // length lookup failed; previously reported as success
 591 |     }
 592 |     for(Py_ssize_t i = 0; i < length; i++) {
 593 |         PyObject *key = PySequence_GetItem(header, i);
 594 |         PyObject *value = key ? PyLong_FromLong(i) : NULL;
 595 |         // PyDict_SetItem can fail (e.g. unhashable key); propagate it.
 596 |         int rc = value ? PyDict_SetItem(self->header_map, key, value) : -1;
 597 |         Py_XDECREF(key);  // released on every path; used to leak when value was NULL
 598 |         Py_XDECREF(value);
 599 |         if(rc) {
 600 |             return -1;
 601 |         }
 602 |     }
 603 | 
 604 |     return 0;
 605 | }
 606 | 
 607 | 
 608 | static PyObject *
 609 | reader_from_cursor(CursorType cursor_type,
 610 |                    StreamCursor *cursor,
 611 |                    const char *yields,
 612 |                    PyObject *header,
 613 |                    char delimiter,
 614 |                    char quotechar,
 615 |                    char escapechar,
 616 |                    bool yield_incomplete_row,
 617 |                    const char *encoding,
 618 |                    const char *errors)
 619 | {
 620 |     ReaderObject *self = PyObject_GC_New(ReaderObject, &ReaderType);
 621 |     if(! self) {
 622 |         delete_cursor(cursor_type, cursor);
 623 |         Py_DECREF(self);
 624 |         return NULL;
 625 |     }
 626 | 
 627 |     self->cursor_type = cursor_type;
 628 |     self->cursor = cursor;
 629 |     self->record = 0;
 630 |     self->errors = errors;
 631 | 
 632 |     if(! strcmp(yields, "dict")) {
 633 |         self->yields = row_asdict;
 634 |     } else if(! strcmp(yields, "list")) {
 635 |         self->yields = row_aslist;
 636 |     } else if(! strcmp(yields, "tuple")) {
 637 |         self->yields = row_astuple;
 638 |     } else {
 639 |         self->yields = row_return_self;
 640 |     }
 641 | 
 642 |     self->header = header && PyObject_IsTrue(header);
 643 |     self->reader = new CsvReader<>(
 644 |         *self->cursor,
 645 |         delimiter,
 646 |         quotechar,
 647 |         escapechar,
 648 |         yield_incomplete_row
 649 |     );
 650 |     self->row = &self->reader->row();
 651 |     self->py_row = row_new(self);
 652 | 
 653 |     // Default to UTF-8 encoding on Python 3.
 654 | #if PY_MAJOR_VERSION >= 3
 655 |     if(! encoding) {
 656 |         encoding = "utf-8";
 657 |     }
 658 | #endif
 659 | 
 660 |     if((! encoding) || (! strcmp(encoding, "bytes"))) {
 661 |         self->to_string = cell_to_bytes;
 662 |     } else if(! strcmp(encoding, "utf-8")) {
 663 |         self->to_string = cell_to_utf8;
 664 |     } else if(! strcmp(encoding, "ascii")) {
 665 |         self->to_string = cell_to_ascii;
 666 |     } else if(! strcmp(encoding, "latin1")) {
 667 |         self->to_string = cell_to_latin1;
 668 | #ifdef HAS_LOCALE
 669 |     } else if(! strcmp(encoding, "locale")) {
 670 |         self->to_string = cell_to_locale;
 671 | #endif
 672 |     } else {
 673 |         self->encoding = encoding;
 674 |         self->to_string = cell_to_unicode;
 675 |     }
 676 | 
 677 |     if(self->header) {
 678 |         int rc;
 679 |         if(PySequence_Check(header)) {
 680 |             rc = header_from_sequence(self, header);
 681 |         } else {
 682 |             rc = header_from_first_row(self);
 683 |         }
 684 | 
 685 |         if(rc) {
 686 |             Py_DECREF((PyObject *) self);
 687 |             return NULL;
 688 |         }
 689 |     } else {
 690 |         self->header_map = NULL;
 691 |     }
 692 | 
 693 |     PyObject_GC_Track((PyObject *) self);
 694 |     return (PyObject *) self;
 695 | }
 696 | 
 697 | 
 698 | static PyObject *
 699 | reader_from_path(PyObject *_self, PyObject *args, PyObject *kw)
 700 | {
 701 |     static char *keywords[] = {"path", "yields", "header", "delimiter",
 702 |         "quotechar", "escapechar", "yield_incomplete_row",
 703 |         "encoding", "errors"};
 704 |     const char *path;
 705 |     const char *yields = "row";
 706 |     PyObject *header = NULL;
 707 |     char delimiter = ',';
 708 |     char quotechar = '"';
 709 |     char escapechar = 0;
 710 |     int yield_incomplete_row = 0;
 711 |     const char *encoding = 0;
 712 |     const char *errors = 0;
 713 | 
 714 |     if(! PyArg_ParseTupleAndKeywords(args, kw, "s|sOccciss:from_path", keywords,
 715 |             &path, &yields, &header, &delimiter, &quotechar, &escapechar,
 716 |             &yield_incomplete_row, &encoding, &errors)) {
 717 |         return NULL;
 718 |     }
 719 | 
 720 |     MappedFileCursor *cursor = new MappedFileCursor();
 721 |     try {
 722 |         cursor->open(path);
 723 |     } catch(csvmonkey::Error &e) {
 724 |         delete cursor;
 725 |         PyErr_Format(PyExc_IOError, "%s: %s", path, e.what());
 726 |         return NULL;
 727 |     }
 728 | 
 729 |     return reader_from_cursor(
 730 |         CURSOR_MAPPED_FILE,
 731 |         cursor,
 732 |         yields,
 733 |         header,
 734 |         delimiter,
 735 |         quotechar,
 736 |         escapechar,
 737 |         yield_incomplete_row,
 738 |         encoding,
 739 |         errors
 740 |     );
 741 | }
 742 | 
 743 | 
 744 | static PyObject *
 745 | reader_from_iter(PyObject *_self, PyObject *args, PyObject *kw)
 746 | {
 747 |     static char *keywords[] = {"iter", "yields", "header",
 748 |         "delimiter", "quotechar", "escapechar", "yield_incomplete_row",
 749 |         "encoding", "errors"};
 750 |     PyObject *iterable;
 751 |     const char *yields = "row";
 752 |     PyObject *header = NULL;
 753 |     char delimiter = ',';
 754 |     char quotechar = '"';
 755 |     char escapechar = 0;
 756 |     int yield_incomplete_row = 0;
 757 |     const char *encoding = 0;
 758 |     const char *errors = 0;
 759 | 
 760 |     if(! PyArg_ParseTupleAndKeywords(args, kw, "O|sOccciss:from_iter",
 761 |             keywords,
 762 |             &iterable, &yields, &header, &delimiter, &quotechar, &escapechar,
 763 |             &yield_incomplete_row, &encoding, &errors)) {
 764 |         return NULL;
 765 |     }
 766 | 
 767 |     PyObject *iter = PyObject_GetIter(iterable);
 768 |     if(! iter) {
 769 |         return NULL;
 770 |     }
 771 | 
 772 |     return reader_from_cursor(
 773 |         CURSOR_ITERATOR,
 774 |         new IteratorStreamCursor(iter),
 775 |         yields,
 776 |         header,
 777 |         delimiter,
 778 |         quotechar,
 779 |         escapechar,
 780 |         yield_incomplete_row,
 781 |         encoding,
 782 |         errors
 783 |     );
 784 | }
 785 | 
 786 | 
 787 | static PyObject *
 788 | reader_from_file(PyObject *_self, PyObject *args, PyObject *kw)
 789 | {
 790 |     static char *keywords[] = {"fp", "yields", "header",
 791 |         "delimiter", "quotechar", "escapechar", "yield_incomplete_row",
 792 |         "encoding", "errors"};
 793 |     PyObject *fp;
 794 |     const char *yields = "row";
 795 |     PyObject *header = NULL;
 796 |     char delimiter = ',';
 797 |     char quotechar = '"';
 798 |     char escapechar = 0;
 799 |     int yield_incomplete_row = 0;
 800 |     const char *encoding = 0;
 801 |     const char *errors = 0;
 802 | 
 803 |     if(! PyArg_ParseTupleAndKeywords(args, kw, "O|sOccciss:from_file", keywords,
 804 |             &fp, &yields, &header, &delimiter, &quotechar, &escapechar,
 805 |             &yield_incomplete_row, &encoding, &errors)) {
 806 |         return NULL;
 807 |     }
 808 | 
 809 |     PyObject *py_read = PyObject_GetAttrString(fp, "read");
 810 |     if(! py_read) {
 811 |         CSM_DEBUG("py_read is null");
 812 |         return NULL;
 813 |     }
 814 | 
 815 |     return reader_from_cursor(
 816 |         CURSOR_PYTHON_FILE,
 817 |         new FileStreamCursor(py_read),
 818 |         yields,
 819 |         header,
 820 |         delimiter,
 821 |         quotechar,
 822 |         escapechar,
 823 |         yield_incomplete_row,
 824 |         encoding,
 825 |         errors
 826 |     );
 827 | }
 828 | 
 829 | 
 830 | static PyObject *
 831 | reader_get_header(ReaderObject *self, PyObject *args)
 832 | {
 833 |     if(! self->header_map) {
 834 |         return PyList_New(0);
 835 |     }
 836 | 
 837 |     PyObject *lst = PyList_New(PyDict_Size(self->header_map));
 838 |     Py_ssize_t ppos = 0;
 839 |     PyObject *key;
 840 |     PyObject *value;
 841 | 
 842 |     while(PyDict_Next(self->header_map, &ppos, &key, &value)) {
 843 |         int i = PyLong_AsLong(value);
 844 |         PyList_SET_ITEM(lst, i, key);
 845 |         Py_INCREF(key);
 846 |     }
 847 | 
 848 |     return lst;
 849 | }
 850 | 
 851 | 
 852 | static PyObject *
 853 | reader_find_cell(ReaderObject *self, PyObject *args)
 854 | {
 855 |     const char *s;
 856 |     if(! PyArg_ParseTuple(args, "s:find_cell", &s)) {
 857 |         return NULL;
 858 |     }
 859 | 
 860 |     CsvCell *cell;
 861 |     if(! self->row->by_value(s, cell)) {
 862 |         PyErr_Format(PyExc_KeyError, "%s", s);
 863 |         return NULL;
 864 |     }
 865 | 
 866 |     return cell_new(self, cell);
 867 | }
 868 | 
 869 | 
 870 | static PyObject *
 871 | reader_repr(ReaderObject *self)
 872 | {
 873 |     return PyUnicode_FromFormat(
 874 |         "<csvmonkey._Reader positioned at record %d>",
 875 |         self->record
 876 |     );
 877 | }
 878 | 
 879 | 
 880 | static PyObject *
 881 | reader_iter(PyObject *self)
 882 | {
 883 |     Py_INCREF(self);
 884 |     return self;
 885 | }
 886 | 
 887 | 
 888 | static PyObject *
 889 | reader_iternext(ReaderObject *self)
 890 | {
 891 |     if(self->reader->read_row()) {
 892 |         self->record++;
 893 |         return self->yields((RowObject *) self->py_row);
 894 |     }
 895 | 
 896 |     if(self->cursor->size() && !self->reader->in_newline_skip) {
 897 |         PyErr_Format(PyExc_IOError,
 898 |             "%lu unparsed bytes at end of input. The input may be missing a "
 899 |             "final newline, or unbalanced quotes are present.",
 900 |             (unsigned long) self->cursor->size()
 901 |         );
 902 |     }
 903 |     if(! PyErr_Occurred()) {
 904 |         PyErr_SetNone(PyExc_StopIteration);
 905 |     }
 906 |     return NULL;
 907 | }
 908 | 
 909 | 
 910 | /*
 911 |  * Cell Type.
 912 |  */
 913 | 
 914 | static PyMethodDef cell_methods[] = {
 915 |     {"as_double",   (PyCFunction)cell_as_double, METH_NOARGS, ""},
 916 |     {"as_str",      (PyCFunction)cell_as_str, METH_NOARGS, ""},
 917 |     {"equals",      (PyCFunction)cell_equals, METH_VARARGS, ""},
 918 |     {0, 0, 0, 0}
 919 | };
 920 | 
 921 | PyTypeObject CellType = {
 922 |     PyVarObject_HEAD_INIT(NULL, 0)
 923 |     .tp_name = "_Cell",
 924 |     .tp_basicsize = sizeof(CellObject),
 925 |     .tp_dealloc = cell_dealloc,
 926 | #if PY_MAJOR_VERSION < 3
 927 |     .tp_compare = cell_compare,
 928 | #endif
 929 |     .tp_flags=Py_TPFLAGS_DEFAULT,
 930 |     .tp_doc="csvmonkey._Cell",
 931 |     .tp_richcompare=cell_richcmp,
 932 |     .tp_methods=cell_methods,
 933 | };
 934 | 
 935 | 
 936 | /*
 937 |  * Row type.
 938 |  */
 939 | 
 940 | static PySequenceMethods row_sequence_methods = {
 941 |     (lenfunc) row_getlength,  /* sq_length */
 942 |     NULL,                        /* sq_concat */
 943 |     NULL,                        /* sq_repeat */
 944 |     (ssizeargfunc) row_getitem,  /* sq_item */
 945 | };
 946 | 
 947 | 
 948 | static PyMappingMethods row_mapping_methods = {
 949 |     (lenfunc) row_getlength,      /* mp_length */
 950 |     (binaryfunc) row_subscript,   /* mp_subscript */
 951 |     0,                               /* mp_ass_subscript */
 952 | };
 953 | 
 954 | static PyMethodDef row_methods[] = {
 955 |     {"aslist",     (PyCFunction)row_aslist, METH_NOARGS, ""},
 956 |     {"astuple",    (PyCFunction)row_astuple, METH_NOARGS, ""},
 957 |     {"asdict",     (PyCFunction)row_asdict, METH_NOARGS, ""},
 958 |     {0, 0, 0, 0}
 959 | };
 960 | 
 961 | PyTypeObject RowType = {
 962 |     PyVarObject_HEAD_INIT(NULL, 0)
 963 |     "_Row",                     /*tp_name*/
 964 |     sizeof(RowObject),          /*tp_basicsize*/
 965 |     0,                          /*tp_itemsize*/
 966 |     (destructor) row_dealloc,   /*tp_dealloc*/
 967 |     0,                          /*tp_print*/
 968 |     0,                          /*tp_getattr*/
 969 |     0,                          /*tp_setattr*/
 970 |     0,                          /*tp_compare*/
 971 |     (reprfunc)row_repr,         /*tp_repr*/
 972 |     0,                          /*tp_as_number*/
 973 |     &row_sequence_methods,      /*tp_as_sequence*/
 974 |     &row_mapping_methods,       /*tp_as_mapping*/
 975 |     0,                          /*tp_hash*/
 976 |     0,                          /*tp_call*/
 977 |     0,                          /*tp_str*/
 978 |     0,                          /*tp_getattro*/
 979 |     0,                          /*tp_setattro*/
 980 |     0,                          /*tp_as_buffer*/
 981 |     Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_GC,         /*tp_flags*/
 982 |     "csvmonkey._Row",            /*tp_doc*/
 983 |     (traverseproc)row_traverse, /*tp_traverse*/
 984 |     (inquiry)row_clear,         /*tp_clear*/
 985 |     0,                          /*tp_richcompare*/
 986 |     0,                          /*tp_weaklistoffset*/
 987 |     (getiterfunc) row_iter,     /*tp_iter*/
 988 |     0,                          /*tp_iternext*/
 989 |     row_methods,                /*tp_methods*/
 990 |     0,                          /*tp_members*/
 991 |     0,                          /*tp_getset*/
 992 |     0,                          /*tp_base*/
 993 |     0,                          /*tp_dict*/
 994 |     0,                          /*tp_descr_get*/
 995 |     0,                          /*tp_descr_set*/
 996 |     0,                          /*tp_dictoffset*/
 997 |     0,                          /*tp_init*/
 998 |     0,                          /*tp_alloc*/
 999 |     0,                          /*tp_new*/
1000 |     0,                          /*tp_free*/
1001 | };
1002 | 
1003 | 
1004 | /*
1005 |  * Reader type.
1006 |  */
1007 | 
1008 | static PyMethodDef reader_methods[] = {
1009 |     {"get_header",  (PyCFunction)reader_get_header, METH_NOARGS, ""},
1010 |     {"find_cell",   (PyCFunction)reader_find_cell, METH_VARARGS, ""},
1011 |     {0, 0, 0, 0}
1012 | };
1013 | 
1014 | PyTypeObject ReaderType = {
1015 |     PyVarObject_HEAD_INIT(NULL, 0)
1016 |     "_Reader",                  /*tp_name*/
1017 |     sizeof(ReaderObject),       /*tp_basicsize*/
1018 |     0,                          /*tp_itemsize*/
1019 |     (destructor) reader_dealloc,/*tp_dealloc*/
1020 |     0,                          /*tp_print*/
1021 |     0,                          /*tp_getattr*/
1022 |     0,                          /*tp_setattr*/
1023 |     0,                          /*tp_compare*/
1024 |     (reprfunc)reader_repr,      /*tp_repr*/
1025 |     0,                          /*tp_as_number*/
1026 |     0,                          /*tp_as_sequence*/
1027 |     0,                          /*tp_as_mapping*/
1028 |     0,                          /*tp_hash*/
1029 |     0,                          /*tp_call*/
1030 |     0,                          /*tp_str*/
1031 |     0,                          /*tp_getattro*/
1032 |     0,                          /*tp_setattro*/
1033 |     0,                          /*tp_as_buffer*/
1034 |     Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_GC,         /*tp_flags*/
1035 |     "csvmonkey._Reader",         /*tp_doc*/
1036 |     (traverseproc)reader_traverse, /*tp_traverse*/
1037 |     (inquiry)reader_clear,      /*tp_clear*/
1038 |     0,                          /*tp_richcompare*/
1039 |     0,                          /*tp_weaklistoffset*/
1040 |     (getiterfunc) reader_iter,  /*tp_iter*/
1041 |     (iternextfunc) reader_iternext,  /*tp_iternext*/
1042 |     reader_methods,             /*tp_methods*/
1043 |     0,                          /*tp_members*/
1044 |     0,                          /*tp_getset*/
1045 |     0,                          /*tp_base*/
1046 |     0,                          /*tp_dict*/
1047 |     0,                          /*tp_descr_get*/
1048 |     0,                          /*tp_descr_set*/
1049 |     0,                          /*tp_dictoffset*/
1050 |     0,                          /*tp_init*/
1051 |     0,                          /*tp_alloc*/
1052 |     0,                          /*tp_new*/
1053 |     0,                          /*tp_free*/
1054 | };
1055 | 
1056 | 
1057 | /*
1058 |  * Module constructor.
1059 |  */
1060 | 
1061 | static struct PyMethodDef module_methods[] = {
1062 |     {"from_path", (PyCFunction) reader_from_path, METH_VARARGS|METH_KEYWORDS},
1063 |     {"from_iter", (PyCFunction) reader_from_iter, METH_VARARGS|METH_KEYWORDS},
1064 |     {"from_file", (PyCFunction) reader_from_file, METH_VARARGS|METH_KEYWORDS},
1065 |     {0, 0, 0, 0}
1066 | };
1067 | 
/*
 * Python 2/3 glue for the module initialiser: Python 3 needs a PyModuleDef
 * and an init function that returns the module, Python 2 an init function
 * that returns nothing. MODINIT_NAME and MOD_RETURN hide the difference.
 */
#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "csvmonkey",        // m_name
    NULL,               // m_doc
    -1,                 // m_size
    module_methods,     // m_methods
    NULL,
    NULL,
    NULL,
    NULL
};
#   define MODINIT_NAME PyInit_csvmonkey
#   define MOD_RETURN(mod) return mod;
#else
#   define MODINIT_NAME initcsvmonkey
#   define MOD_RETURN(mod) return
#endif
1086 | 
1087 | 
1088 | PyMODINIT_FUNC
1089 | MODINIT_NAME(void)
1090 | {
1091 |     static PyTypeObject *types[] = {
1092 |         &CellType, &RowType, &ReaderType
1093 |     };
1094 | 
1095 | #if PY_MAJOR_VERSION >= 3
1096 |     PyObject *mod = PyModule_Create(&moduledef);
1097 | #else
1098 |     PyObject *mod = Py_InitModule3("csvmonkey", module_methods, "");
1099 | #endif
1100 |     if(! mod) {
1101 |         MOD_RETURN(NULL);
1102 |     }
1103 | 
1104 |     for(int i = 0; i < (sizeof types / sizeof types[0]); i++) {
1105 |         PyTypeObject *type = types[i];
1106 |         if(PyType_Ready(type)) {
1107 |             MOD_RETURN(NULL);
1108 |         }
1109 |         if(PyModule_AddObject(mod, type->tp_name, (PyObject *)type)) {
1110 |             MOD_RETURN(NULL);
1111 |         }
1112 |     }
1113 | 
1114 |     MOD_RETURN(mod);
1115 | }
1116 | 


--------------------------------------------------------------------------------
/cpython/file_stream_cursor.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | class FileStreamCursor
 3 |     : public csvmonkey::BufferedStreamCursor
 4 | {
 5 |     PyObject *args_tuple_;
 6 |     PyObject *read_;
 7 | 
 8 |     public:
 9 |     FileStreamCursor(PyObject *read)
10 |         : BufferedStreamCursor()
11 |         , read_(read)
12 |         , args_tuple_(Py_BuildValue("(i)", 65536))
13 |     {
14 |         assert(args_tuple_ != 0);
15 |     }
16 | 
17 |     ~FileStreamCursor()
18 |     {
19 |         Py_DECREF(args_tuple_);
20 |         Py_DECREF(read_);
21 |     }
22 | 
23 |     virtual ssize_t readmore()
24 |     {
25 |         PyObject *result = PyObject_Call(read_, args_tuple_, NULL);
26 |         CSM_DEBUG("result = %lu", result);
27 |         if(! result) {
28 |             return -1;
29 |         }
30 | 
31 |         if(! PyBytes_CheckExact(result)) {
32 |             PyErr_SetString(PyExc_TypeError,
33 |                 "CSV iterable must yield exactly a string.");
34 |             Py_DECREF(result);
35 |             return -1;
36 |         }
37 | 
38 |         Py_ssize_t sz = PyBytes_GET_SIZE(result);
39 |         if(! sz) {
40 |             return -1;
41 |         }
42 | 
43 |         ensure(sz);
44 |         memcpy(&vec_[write_pos_], PyBytes_AS_STRING(result), sz);
45 |         Py_DECREF(result);
46 |         return sz;
47 |     }
48 | };
49 | 


--------------------------------------------------------------------------------
/cpython/iterator_stream_cursor.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | class IteratorStreamCursor
 3 |     : public csvmonkey::BufferedStreamCursor
 4 | {
 5 |     PyObject *iter_;
 6 | 
 7 |     public:
 8 |     IteratorStreamCursor(PyObject *iter)
 9 |         : BufferedStreamCursor()
10 |         , iter_(iter)
11 |     {
12 |     }
13 | 
14 |     ~IteratorStreamCursor()
15 |     {
16 |         Py_DECREF(iter_);
17 |     }
18 | 
19 |     virtual ssize_t readmore()
20 |     {
21 |         PyObject *result = PyIter_Next(iter_);
22 |         if(! result) {
23 |             return -1;
24 |         }
25 | 
26 |         if(! PyBytes_CheckExact(result)) {
27 |             PyErr_SetString(PyExc_TypeError,
28 |                 "CSV iterable must yield exactly a string.");
29 |             Py_DECREF(result);
30 |             return -1;
31 |         }
32 | 
33 |         Py_ssize_t sz = PyBytes_GET_SIZE(result);
34 |         if(! sz) {
35 |             return -1;
36 |         }
37 | 
38 |         ensure(sz);
39 |         memcpy(&vec_[write_pos_], PyBytes_AS_STRING(result), sz);
40 |         Py_DECREF(result);
41 |         return sz;
42 |     }
43 | };
44 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for Sphinx documentation
 2 | #
 3 | 
 4 | default:
 5 | 	sphinx-build . build/html/
 6 | 
 7 | # You can set these variables from the command line.
 8 | SPHINXOPTS    =
 9 | SPHINXBUILD   = sphinx-build
10 | PAPER         =
11 | BUILDDIR      = build
12 | 
13 | # User-friendly check for sphinx-build
14 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
15 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from https://sphinx-doc.org/)
16 | endif
17 | 
18 | # Internal variables.
19 | PAPEROPT_a4     = -D latex_paper_size=a4
20 | PAPEROPT_letter = -D latex_paper_size=letter
21 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
22 | # the i18n builder cannot share the environment and doctrees with the others
23 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
24 | 
25 | .PHONY: help
26 | help:
27 | 	@echo "Please use \`make <target>' where <target> is one of"
28 | 	@echo "  html       to make standalone HTML files"
29 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
30 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
31 | 	@echo "  linkcheck  to check all external links for integrity"
32 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
33 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
34 | 
35 | .PHONY: clean
36 | clean:
37 | 	rm -rf $(BUILDDIR)/*
38 | 
39 | .PHONY: html
40 | html:
41 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
42 | 	@echo
43 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
44 | 
45 | .PHONY: dirhtml
46 | dirhtml:
47 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
48 | 	@echo
49 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
50 | 
51 | .PHONY: changes
52 | changes:
53 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
54 | 	@echo
55 | 	@echo "The overview file is in $(BUILDDIR)/changes."
56 | 
57 | .PHONY: linkcheck
58 | linkcheck:
59 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
60 | 	@echo
61 | 	@echo "Link check complete; look for any errors in the above output " \
62 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
63 | 
64 | .PHONY: doctest
65 | doctest:
66 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
67 | 	@echo "Testing of doctests in the sources finished, look at the " \
68 | 	      "results in $(BUILDDIR)/doctest/output.txt."
69 | 
70 | .PHONY: coverage
71 | coverage:
72 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
73 | 	@echo "Testing of coverage in the sources finished, look at the " \
74 | 	      "results in $(BUILDDIR)/coverage/python.txt."
75 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | author = u'David Wilson'
 2 | copyright = u'2019, David Wilson'
 3 | exclude_patterns = ['build']
 4 | html_show_sourcelink = False
 5 | html_show_sphinx = False
 6 | html_sidebars = {'**': ['globaltoc.html', 'github.html']}
 7 | html_static_path = ['static']
 8 | html_theme_path = ['.']
 9 | html_theme = 'alabaster'
10 | html_theme_options = {
11 |     'font_family': "Georgia, serif",
12 |     'head_font_family': "Georgia, serif",
13 |     'fixed_sidebar': True,
14 |     'show_powered_by': False,
15 |     'pink_2': 'fffafaf',
16 |     'pink_1': '#fff0f0',
17 | }
18 | htmlhelp_basename = 'csvmonkeydoc'
19 | language = None
20 | master_doc = 'index'
21 | project = u'csvmonkey'
22 | pygments_style = 'sphinx'
23 | source_suffix = '.rst'
24 | templates_path = ['templates']
25 | todo_include_todos = False
26 | version = '0.0.2'
27 | release = version
28 | 


--------------------------------------------------------------------------------
/docs/cpp.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | C++ API
  3 | =======
  4 | 
  5 |  .. default-domain:: cpp
  6 | 
  7 | 
  8 | 
  9 | Errors
 10 | ------
 11 | 
 12 | .. class:: csvmonkey::Error : public std::exception
 13 | 
 14 |     Thrown in various places during setup, due to failed file IO or field
 15 |     extraction.
 16 | 
 17 |     .. function:: const char \*what() const
 18 | 
 19 |         Describe the reason for the error.
 20 | 
 21 | 
 22 | Cursors
 23 | -------
 24 | 
 25 | .. class:: csvmonkey::StreamCursor
 26 | 
 27 |     Abstract base for implementing iteration over an input stream.
 28 |     :class:`StreamCursor` is the interface used by :class:`CsvReader` to
 29 |     acquire and manage its input buffer.
 30 | 
 31 |     .. function:: virtual const char \*buf() = 0
 32 | 
 33 |         Current stream position. Must guarantee access to
 34 |         `buf()[0..size()+31]`, with 31 trailing NULs to allow safely running
 35 |         ``PCMPSTRI`` on the final data byte.
 36 | 
 37 |     .. function:: virtual size_t size() = 0
 38 | 
 39 |         Size of the buffer pointed to by :func:`buf`.
 40 | 
 41 |     .. function:: virtual void consume(size_t n) = 0
 42 | 
 43 |         Called by :class:`CsvReader` to indicate `n` bytes from the front of
 44 |         :func:`buf` have been consumed. The return value of :func:`buf` and
 45 |         :func:`size` should now reflect a buffer positioned on the first byte
 46 |         following `n`.
 47 | 
 48 |     .. function:: virtual bool fill() = 0
 49 | 
 50 |         Called by :class:`CsvReader` to request more input. This function
 51 |         returns `true` to indicate the buffer provided by :func:`buf` and
 52 |         :func:`size` has been extended, or `false` to indicate EOF.
 53 | 
 54 | 
 55 | .. class:: csvmonkey::MappedFileCursor : public StreamCursor
 56 | 
 57 |     Implement zero-copy input using a memory-mapped file.
 58 | 
 59 |     .. function:: void open(const char \*filename)
 60 | 
 61 |         Open `filename` for reading. Throws :class:`Error` on failure.
 62 | 
 63 | 
 64 | .. class:: csvmonkey::BufferedStreamCursor : public StreamCursor
 65 | 
 66 |     Base class for any cursor implementation that requires buffering.
 67 | 
 68 |     .. member:: std::vector<char> vec_
 69 | 
 70 |         The buffer
 71 | 
 72 |     .. member:: size_t write_pos_
 73 | 
 74 |         Current write offset within the buffer. New data appended to
 75 |         :member:`vec_` by :func:`readmore` should append past `write_pos_`.
 76 | 
 77 |     .. function:: void ensure(size_t capacity)
 78 | 
 79 |         Ensure at least `capacity` additional bytes are available in the buffer
 80 |         starting at the current write position.
 81 | 
 82 |     .. function:: virtual ssize_t readmore() = 0
 83 | 
 84 |         Arrange for more data to fill the buffer. Your implementation should
 85 |         issue some IO request and copy the result into `vec_[write_pos_:]`.
        The function should return -1 on error, 0 on EOF, or a positive count
        of the bytes that were appended.
 88 | 
 89 | 
 90 | .. class:: csvmonkey::FdStreamCursor : public BufferedStreamCursor
 91 | 
 92 |     Implement buffered input from a UNIX file descriptor.
 93 | 
 94 |     .. function:: FdStreamCursor(int fd)
 95 | 
 96 |         Construct a new instance using `fd`.
 97 | 
 98 | 
 99 | CsvCell
100 | -------
101 | 
102 | .. class:: csvmonkey::CsvCell
103 | 
104 |     Descriptor for a single parsed CSV field.
105 | 
106 |     Cells describe fields in terms of references to :func:`StreamCursor::buf`,
107 |     and thus become invalid once the underlying stream cursor is mutated.
    :class:`CsvReader` reuses a single vector of cells throughout the run, so
    any cells obtained after a successful :func:`CsvReader::parse_row` call
    are invalidated by the next call to :func:`CsvReader::parse_row`.
111 |   
112 |     .. member:: const char \*ptr
113 | 
114 |         Pointer to the start of the CSV field.
115 |  
116 |     .. member:: size_t size
117 | 
118 |         Size of the CSV field.
119 | 
120 |     .. member:: char escapechar
121 | 
122 |         Escape character configured for the :class:`CsvReader`.
123 | 
124 |     .. member:: char quotechar
125 | 
126 |         Quote character configured for the :class:`CsvReader`.
127 | 
128 |     .. member:: bool escaped
129 | 
130 |         If `true`, at least one escape character exists in the field. Its
131 |         value must be accessed via :func:`CsvCell::as_str`.
132 | 
133 |     .. function:: std::string as_str()
134 | 
        Return a string with any quotes and escapes decoded.
136 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Table of Contents
 3 | ==================
 4 | 
 5 | 
 6 | .. toctree::
 7 | 
 8 |     python
 9 |     cpp
10 | 


--------------------------------------------------------------------------------
/docs/python.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Python API
 3 | ==========
 4 | 
 5 | 
 6 | 
 7 | Factory Functions
 8 | -----------------
 9 | 
10 | 
11 | Common Arguments
12 | ^^^^^^^^^^^^^^^^
13 | 
14 | * **yields**: Specify the kind of value returned during iteration.
15 | 
16 |   * `row`: Cause a :class:`Row` to be yielded. Rows are dict/sequence-like
17 |     objects that support lazy decoding.
18 |   * `list`: Cause a fully decoded list to be yielded.
19 |   * `tuple`: Cause a fully decoded tuple to be yielded.
20 |   * `dict`: Cause a fully decoded dict to be yielded, in the style of
21 |     :class:`csv.DictReader`.
22 | 
* **header**: Specify whether a header row exists, or supply an explicit set
  of column names. The header is used to form the keys available via the
  :class:`Row` object, and for constructing dicts. May be any of:
26 | 
27 |   * :data:`True`: a header row exists and should be read away during
28 |     construction.
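  * :data:`False`: no header row exists.
  * a non-empty sequence: its items are used as the column names, and no
    header row is read from the input.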
30 | 
31 | 
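:param header:
    See **header** above. Defaults to no header.
:param delimiter:
    Single-character field delimiter. Defaults to ``,``.
:param quotechar:
    Single-character quote character. Defaults to ``"``.
:param escapechar:
    Single-character escape character. Defaults to none.
:param bool yield_incomplete_row:
    Whether an incomplete row should be yielded. Defaults to :data:`False`.
:param encoding:
    Name of the encoding. Defaults to ``utf-8`` on Python 3 and to ``bytes``
    on Python 2.
:param str errors:
    One of "strict", "ignore" or "replace".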
46 | 
47 | 
48 | 
49 | 
50 | .. function:: from_iter
51 | 
52 | 
53 | 
54 | .. function:: from_file
55 | .. function:: from_path
56 | 
57 | 


--------------------------------------------------------------------------------
/docs/static/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dw/csvmonkey/dc621253348e3cb353c3641dfbf2193c276f6dfe/docs/static/.empty


--------------------------------------------------------------------------------
/docs/templates/github.html:
--------------------------------------------------------------------------------
1 | <p>
2 | <br>
3 | <a class="github-button" href="https://github.com/dw/csvmonkey/" data-size="large" data-show-count="true" aria-label="Star dw/csvmonkey on GitHub">Star</a>
4 | </p>
5 | 


--------------------------------------------------------------------------------
/docs/templates/globaltoc.html:
--------------------------------------------------------------------------------
1 | {{ toctree() }}
2 | 


--------------------------------------------------------------------------------
/docs/templates/layout.html:
--------------------------------------------------------------------------------
 1 | {% extends "!layout.html" %}
 2 | {% set css_files = css_files + ['_static/style.css'] %}
 3 | 
 4 | {# We don't support Sphinx search, so don't load its JS either. #}
 5 | {% block scripts %}
 6 | {% endblock %}
 7 | 
 8 | {# Alabaster ships a completely useless custom.css, suppress it. #}
 9 | {%- block extrahead %}
10 |     <meta name="google-site-verification" content="oq5hNxRYo25tcfjfs3l6pPxfNgY3JzDYSpskc9q4TYI" />
11 |     <meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9" />
12 | {% endblock %}
13 | 
14 | {% block footer %}
15 |     {{ super() }}
16 | 
17 |     <script>
18 |       (function() {
19 |         {% include "piwik-config.js" %}
20 |         var u="https://networkgenomics.com/p/tr/";
21 |         _paq.push(['setTrackerUrl', u+'ep']);
22 |         var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; g.type='text/javascript';
23 |         g.defer=true; g.async=true; g.src=u+'js'; s.parentNode.insertBefore(g,s);
24 |       })();
25 |     </script>
26 | 
27 |     <noscript>
28 |         <p>
29 |         {% set fulltitle = (title|striptags|e) + titlesuffix -%}
30 |         <img src="https://networkgenomics.com/p/tr/ep?idsite=6&action_name={{fulltitle}}" style="border:0" alt="">
31 |         </p>
32 |     </noscript>
33 | 
34 |     <script async defer src="https://buttons.github.io/buttons.js"></script>
35 | {% endblock %}
36 | 


--------------------------------------------------------------------------------
/docs/templates/piwik-config.js:
--------------------------------------------------------------------------------
1 | window._paq = []; // Tracker command queue; layout.html appends setTrackerUrl to it.
2 | window._paq.push(['trackPageView']);
3 | window._paq.push(['enableLinkTracking']);
4 | window._paq.push(['enableHeartBeatTimer', 30]); // NOTE(review): 30 is presumably seconds - confirm.
5 | window._paq.push(['setSiteId', 6]); // Same site id as the idsite=6 fallback image in layout.html.
6 | 


--------------------------------------------------------------------------------
/include/csvmonkey.hpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <algorithm>
  3 | #include <cassert>
  4 | #include <cerrno>
  5 | #include <cstring>
  6 | #include <exception>
  7 | #include <fcntl.h>
  8 | #include <fstream>
  9 | #include <iostream>
 10 | #include <stdexcept>
 11 | #include <stdlib.h>
 12 | #include <sys/mman.h>
 13 | #include <sys/stat.h>
 14 | #include <sys/types.h>
 15 | #include <unistd.h>
 16 | #include <vector>
 17 | 
 18 | #if defined(__SSE4_2__) && !defined(CSM_IGNORE_SSE42)
 19 | #define CSM_USE_SSE42
 20 | #include <emmintrin.h>
 21 | #include <smmintrin.h>
 22 | #endif // __SSE4_2__
 23 | 
 24 | #ifdef USE_SPIRIT
 25 | #include "boost/spirit/include/qi.hpp"
 26 | #endif
 27 | 
 28 | 
 29 | #ifdef CSVMONKEY_DEBUG
 30 | #   define CSM_DEBUG(x, ...) fprintf(stderr, "csvmonkey: " x "\n", ##__VA_ARGS__);
 31 | #else
 32 | #   define CSM_DEBUG(x...) {}
 33 | #endif
 34 | 
 35 | 
 36 | namespace csvmonkey {
 37 | 
 38 | 
 39 | class StreamCursor;
 40 | 
 41 | 
 42 | template<class StreamCursorType=StreamCursor>
 43 | class CsvReader;
 44 | 
 45 | 
 46 | /** Exception raised by csvmonkey; what() yields "<category>: <detail>". */
 47 | class Error : public std::exception
 48 | {
 49 |     std::string s_;
 50 | 
 51 |     public:
 52 |     /** Compose the message from a category label and a detail string. */
 53 |     Error(const char *category, const std::string &s)
 54 |         : s_(std::string(category) + ": " + s)
 55 |     {
 56 |     }
 57 | 
 58 |     virtual const char *
 59 |     what() const throw()
 60 |     {
 61 |         return s_.c_str();
 62 |     }
 63 | };
 64 | 
 65 | 
 66 | class StreamCursor
 67 | {
 68 |     public:
 69 |     virtual ~StreamCursor() {} // Permit safe deletion via a base pointer.
 70 |     /**
 71 |      * Current stream position. The 31 bytes after buf()+size() must remain
 72 |      * readable NULs, as the parser scans 32 bytes from any valid position.
 73 |      */
 74 |     virtual const char *buf() = 0;
 75 |     virtual size_t size() = 0;          // Bytes available at buf().
 76 |     virtual void consume(size_t n) = 0; // Advance past the first n bytes.
 77 |     virtual bool fill() = 0;            // Try to get more data; false ends reading.
 78 | };
 79 | 
 80 | 
 81 | class MappedFileCursor // Cursor over a read-only mmap() of a whole file.
 82 |     : public StreamCursor
 83 | {
 84 |     char *startp_;  // Start of the mapped file data; 0 while nothing is mapped.
 85 |     char *endp_;    // One past the last byte of file data.
 86 |     char *p_;       // Current read position.
 87 |     char *guardp_;  // Zero-filled guard page following the file data.
 88 | 
 89 |     size_t get_page_size()
 90 |     {
 91 |         return (size_t) sysconf(_SC_PAGESIZE);
 92 |     }
 93 | 
 94 |     public:
 95 |     MappedFileCursor()
 96 |         : startp_(0)
 97 |         , endp_(0)
 98 |         , p_(0)
 99 |         , guardp_(0)
100 |     {
101 |     }
102 | 
103 |     ~MappedFileCursor()
104 |     {
105 |         if(startp_) {
106 |             ::munmap(startp_, endp_ - startp_);
107 |         }
108 |         if(guardp_) {
109 |             ::munmap(guardp_, get_page_size());
110 |         }
111 |     }
112 | 
113 |     const char *buf()
114 |     {
115 |         return p_;
116 |     }
117 | 
118 |     size_t size()
119 |     {
120 |         return endp_ - p_;
121 |     }
122 | 
123 |     void consume(size_t n)
124 |     {
125 |         p_ += std::min(n, (size_t) (endp_ - p_)); // Never move past endp_.
126 |         CSM_DEBUG("consume(%lu); new size: %lu", n, size())
127 |     }
128 | 
129 |     bool fill()
130 |     {
131 |         return false; // The whole file is already mapped; nothing to add.
132 |     }
133 | 
134 |     void open(const char *filename) // Map `filename`; throws Error on failure.
135 |     {
136 |         int fd = ::open(filename, O_RDONLY);
137 |         if(fd == -1) {
138 |             throw Error(filename, strerror(errno));
139 |         }
140 | 
141 |         struct stat st;
142 |         if(fstat(fd, &st) == -1) {
143 |             int saved_errno = errno; // close() below may overwrite errno.
144 |             ::close(fd);
145 |             throw Error("fstat", strerror(saved_errno));
146 |         }
147 | 
148 |         // We can't pick an address ourselves and MAP_FIXED a guard page after
149 |         // the file data: MAP_FIXED silently overwrites unrelated mappings, and
150 |         // the kernel's placement behaviour varies with the mapping size. So
151 |         // make one MAP_ANON mapping the size of the datafile + the guard page,
152 |         // leaving the kernel to pick addresses, then use MAP_FIXED to overwrite
153 |         // its start with the file. Reserving the range first avoids a race
154 |         // with other threads creating mappings at the same time.
155 | 
156 |         unsigned long page_size = get_page_size();
157 |         unsigned long page_mask = page_size - 1;
158 |         size_t rounded = (st.st_size & page_mask)
159 |             ? ((st.st_size & ~page_mask) + page_size)
160 |             : st.st_size;
161 | 
162 |         // mmap() signals failure with MAP_FAILED, never a null pointer.
163 |         void *reserved = mmap(0, rounded+page_size, PROT_READ,
164 |                               MAP_ANON|MAP_PRIVATE, -1, 0);
165 |         if(reserved == MAP_FAILED) {
166 |             ::close(fd);
167 |             throw Error("mmap", "could not allocate guard page");
168 |         }
169 | 
170 |         char *startp = (char *) reserved;
171 |         if(! st.st_size) {
172 |             // Zero-length file mappings are invalid; the reservation is then
173 |             // just the guard page, which serves as an empty NUL-filled buffer.
174 |             ::close(fd);
175 |             guardp_ = endp_ = p_ = startp;
176 |             return;
177 |         }
178 | 
179 |         void *mapped = mmap(startp, st.st_size, PROT_READ,
180 |                             MAP_SHARED|MAP_FIXED, fd, 0);
181 |         ::close(fd);
182 |         if(mapped != startp) {
183 |             CSM_DEBUG("could not place data below guard page (%p) at %p, got %p.",
184 |                   startp + rounded, startp, mapped);
185 |             ::munmap(startp, rounded+page_size); // Don't leak the reservation.
186 |             throw Error("mmap", "could not place data below guard page");
187 |         }
188 | 
189 |         ::madvise(startp, st.st_size, MADV_SEQUENTIAL);
190 |         ::madvise(startp, st.st_size, MADV_WILLNEED);
191 |         guardp_ = startp + rounded; // Members are only set once mapping succeeded.
192 |         startp_ = p_ = startp;
193 |         endp_ = startp + st.st_size;
194 |     }
195 | };
196 | 
197 | 
198 | class BufferedStreamCursor // Cursor over a growable buffer filled by readmore().
199 |     : public StreamCursor
200 | {
201 |     protected:
202 |     std::vector<char> vec_; // Backing buffer.
203 |     size_t read_pos_;       // Offset of the first unconsumed byte in vec_.
204 |     size_t write_pos_;      // Offset one past the last byte read into vec_.
205 | 
206 |     virtual ssize_t readmore() = 0; // Bytes appended at write_pos_, or -1 on failure.
207 | 
208 |     BufferedStreamCursor()
209 |         : vec_(131072)
210 |         , read_pos_(0)
211 |         , write_pos_(0)
212 |     {
213 |     }
214 | 
215 |     protected:
216 |     void ensure(size_t capacity) // Grow vec_ if fewer than `capacity` bytes are free.
217 |     {
218 |         size_t available = vec_.size() - write_pos_;
219 |         if(available < capacity) {
220 |             CSM_DEBUG("resizing vec_ %lu", (size_t)(vec_.size() + capacity));
221 |             vec_.resize(32 + (vec_.size() + capacity));
222 |         }
223 |     }
224 | 
225 |     public:
226 |     const char *buf()
227 |     {
228 |         return &vec_[0] + read_pos_;
229 |     }
230 | 
231 |     size_t size()
232 |     {
233 |         return write_pos_ - read_pos_;
234 |     }
235 | 
236 |     void consume(size_t n)
237 |     {
238 |         read_pos_ += std::min(n, write_pos_ - read_pos_); // Never pass write_pos_.
239 |         CSM_DEBUG("consume(%lu); new size: %lu", n, size())
240 |     }
241 | 
242 |     virtual bool fill() // Compact the buffer, then append data from readmore().
243 |     {
244 |         if(read_pos_) {
245 |             size_t n = write_pos_ - read_pos_;
246 |             CSM_DEBUG("read_pos_ needs adjust, it is %lu / n = %lu", read_pos_, n);
247 |             memmove(&vec_[0], &vec_[read_pos_], n); // Ranges overlap if n > read_pos_.
248 |             CSM_DEBUG("fill() adjust old write_pos = %lu", write_pos_);
249 |             write_pos_ -= read_pos_;
250 |             read_pos_ = 0;
251 |             CSM_DEBUG("fill() adjust new write_pos = %lu", write_pos_);
252 |         }
253 | 
254 |         if(write_pos_ == vec_.size()) {
255 |             ensure(vec_.size() / 2); // Buffer is full of unconsumed data: grow it.
256 |         }
257 | 
258 |         ssize_t rc = readmore();
259 |         if(rc == -1) {
260 |             CSM_DEBUG("readmore() failed");
261 |             return false;
262 |         }
263 | 
264 |         CSM_DEBUG("readmore() succeeded")
265 |         CSM_DEBUG("fill() old write_pos = %lu", write_pos_);
266 |         write_pos_ += rc;
267 |         CSM_DEBUG("fill() new write_pos = %lu", write_pos_);
268 |         return write_pos_ > 0; // NOTE(review): true even when rc == 0 with data buffered - confirm EOF handling.
269 |     }
270 | };
271 | 
272 | 
273 | /** Buffered cursor that pulls its data from a file descriptor via read(2). */
274 | class FdStreamCursor : public BufferedStreamCursor
275 | {
276 |     int fd_; // Descriptor to read from; this class never closes it.
277 | 
278 |     public:
279 |     FdStreamCursor(int fd)
280 |         : fd_(fd)
281 |     {
282 |     }
283 | 
284 |     virtual ssize_t readmore()
285 |     {
286 |         const size_t room = vec_.size() - write_pos_;
287 |         return ::read(fd_, &vec_[write_pos_], room);
288 |     }
289 | };
290 | 
291 | 
292 | struct CsvCell
293 | {
294 |     const char *ptr;  // Raw cell bytes; 0 for an empty final field.
295 |     size_t size;      // Number of raw bytes at ptr.
296 |     char escapechar;  // Escape character, or 0 when none is configured.
297 |     char quotechar;   // Quote character.
298 |     bool escaped;     // True if the raw bytes contain quote/escape sequences.
299 | 
300 |     /** Return the cell value with quote/escape sequences decoded. */
301 |     std::string as_str()
302 |     {
303 |         std::string s = ptr ? std::string(ptr, size) : std::string();
304 |         if(escaped) {
305 |             size_t o = 0;
306 |             for(size_t i = 0; i < s.size();) {
307 |                 char c = s[i];
308 |                 // Only skip an escape/quote char if another char follows it.
309 |                 if(((escapechar && c == escapechar) || (c == quotechar))
310 |                         && i + 1 < s.size()) {
311 |                     i++;
312 |                 }
313 |                 s[o++] = s[i++];
314 |             }
315 |             s.resize(o);
316 |         }
317 |         return s;
318 |     }
319 | 
320 |     /** True if the raw bytes begin with the NUL-terminated string `str`. */
321 |     bool startswith(const char *str) const
322 |     {
323 |         size_t n = strlen(str);
324 |         return n <= size && (n == 0 || memcmp(ptr, str, n) == 0);
325 |     }
326 | 
327 |     /** True if the raw bytes are exactly the NUL-terminated string `str`. */
328 |     bool equals(const char *str) const
329 |     {
330 |         return strlen(str) == size && (! size || memcmp(ptr, str, size) == 0);
331 |     }
332 | 
333 |     /** Parse the leading bytes of the cell as a double; 0.0 if impossible. */
334 |     double as_double()
335 |     {
336 |         double n = 0.0;
337 | #ifdef USE_SPIRIT
338 |         const char *first = ptr; // qi::parse() advances its first argument.
339 |         boost::spirit::qi::parse(first, ptr+size, boost::spirit::qi::double_, n);
340 | #else
341 |         // Copy so strtod() sees a terminator and cannot run past the cell.
342 |         if(size) n = strtod(std::string(ptr, size).c_str(), NULL);
343 | #endif
344 |         return n;
345 |     }
346 | };
347 | 
348 | 
349 | struct FieldPair // One (header name, output pointer) request for extract_fields().
350 | {
351 |     const char *name;   // Header value to search the current row for.
352 |     CsvCell **cell;     // Location that receives a pointer to the matching cell.
353 | };
354 | 
355 | 
356 | #ifndef CSM_USE_SSE42
357 | #warning Using non-SSE4.2 fallback implementation.
358 | /**
359 |  * Callable that matches a set of up to 5 bytes (including NUL) in a 16 byte
360 |  * string. The index 0..15 of the first occurrence is returned, otherwise 16 is
361 |  * returned if no match is found or NUL is encountered.
362 |  */
363 | struct StringSpannerFallback
364 | {
365 |     uint8_t charset_[256]; // charset_[b] is 1 when byte value b stops the scan.
366 | 
367 |     StringSpannerFallback(char c1=0, char c2=0, char c3=0, char c4=0)
368 |     {
369 |         ::memset(charset_, 0, sizeof charset_);
370 |         charset_[(unsigned char) c1] = 1; // Via unsigned char: a negative char
371 |         charset_[(unsigned char) c2] = 1; // must map to 128..255 rather than
372 |         charset_[(unsigned char) c3] = 1; // sign-extending into an index far
373 |         charset_[(unsigned char) c4] = 1; // outside the 256-entry table.
374 |         charset_[0] = 1; // NUL always terminates the scan.
375 |     }
376 | 
377 |     size_t
378 |     operator()(const char *s)
379 |         __attribute__((__always_inline__))
380 |     {
381 |         CSM_DEBUG("bitfield[32] = %d", charset_[32]);
382 |         CSM_DEBUG("span[0] = {%d,%d,%d,%d,%d,%d,%d,%d}",
383 |             s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
384 |         CSM_DEBUG("span[1] = {%d,%d,%d,%d,%d,%d,%d,%d}",
385 |             s[8], s[9], s[10], s[11], s[12], s[13], s[14], s[15]);
386 | 
387 |         auto p = (const unsigned char *)s;
388 |         auto e = p + 16;
389 | 
390 |         do { // Test four bytes per iteration; p ends on the first member found.
391 |             if(charset_[p[0]]) {
392 |                 break;
393 |             }
394 |             if(charset_[p[1]]) {
395 |                 p++;
396 |                 break;
397 |             }
398 |             if(charset_[p[2]]) {
399 |                 p += 2;
400 |                 break;
401 |             }
402 |             if(charset_[p[3]]) {
403 |                 p += 3;
404 |                 break;
405 |             }
406 |             p += 4;
407 |         } while(p < e);
408 | 
409 |         if(! *p) {
410 |             return 16; // PCMPISTRI reports NUL encountered as no match.
411 |         }
412 | 
413 |         return p - (const unsigned char *)s; // 16 when the loop ran to completion.
414 |     }
415 | };
416 | 
417 | using StringSpanner = StringSpannerFallback;
418 | #   define CSM_ATTR_SSE42
419 | #endif // !CSM_USE_SSE42
420 | 
421 | 
422 | #ifdef CSM_USE_SSE42
423 | struct alignas(16) StringSpannerSse42 // Finds the first byte of a small set within 16 bytes via PCMPISTRI.
424 | {
425 |     __m128i v_; // The byte set to search for, one byte per lane.
426 | 
427 |     StringSpannerSse42(char c1=0, char c2=0, char c3=0, char c4=0)
428 |     {
429 |         assert(! ((reinterpret_cast<intptr_t>(&v_) & 15))); // v_ must be 16-byte aligned.
430 |         __v16qi vq = {c1, c2, c3, c4}; // The remaining 12 lanes are zero-initialised.
431 |         v_ = (__m128i) vq;
432 |     }
433 | 
434 |     size_t __attribute__((__always_inline__, target("sse4.2")))
435 |     operator()(const char *buf)
436 |     {
437 |         return _mm_cmpistri(
438 |             v_,
439 |             _mm_loadu_si128((__m128i *) buf), // Unaligned 16-byte load from buf.
440 |             0 // Mode 0: unsigned bytes, "equal any", index of the first match.
441 |         );
442 |     }
443 | };
444 | 
445 | using StringSpanner = StringSpannerSse42;
446 | #   define CSM_ATTR_SSE42 __attribute__((target("sse4.2")))
447 | #endif // CSM_USE_SSE42
448 | 
449 | 
450 | class CsvCursor // Holds the cells of the most recently parsed row.
451 | {
452 |     public:
453 |     std::vector<CsvCell> cells; // Cell storage; only the first `count` are live.
454 |     size_t count;               // Number of cells in the current row.
455 | 
456 |     CsvCursor() : cells(), count(0) {}
457 | 
458 |     // Point `cell` at the first live cell whose decoded value is `value`;
459 |     // returns false (leaving `cell` untouched) when there is none.
460 |     bool
461 |     by_value(const std::string &value, CsvCell *&cell)
462 |     {
463 |         size_t idx = 0;
464 |         while(idx != count) {
465 |             CsvCell &candidate = cells[idx++];
466 |             if(candidate.as_str() == value) {
467 |                 cell = &candidate;
468 |                 return true;
469 |             }
470 |         }
471 |         return false;
472 |     }
473 | };
474 | 
475 | 
476 | template<class StreamCursorType>
477 | class alignas(16) CsvReader
478 | {
479 |     const char *endp_;
480 |     const char *p_;
481 |     char delimiter_;
482 |     char quotechar_;
483 |     char escapechar_;
484 |     bool yield_incomplete_row_;
485 | 
486 |     public:
487 |     bool in_newline_skip;
488 | 
489 |     private:
490 |     StreamCursorType &stream_;
491 |     StringSpanner quoted_cell_spanner_;
492 |     StringSpanner unquoted_cell_spanner_;
493 |     CsvCursor row_;
494 | 
495 |     enum CsmTryParseReturnType {
496 |         kCsmTryParseOkay,
497 |         kCsmTryParseOverflow,
498 |         kCsmTryParseUnderrun
499 |     };
500 | 
501 |     CsmTryParseReturnType
502 |     try_parse() // Parse one row from [p_, endp_) into row_; p_ advances only on success.
503 |         CSM_ATTR_SSE42
504 |     {
505 |         const char *p = p_;
506 |         const char *cell_start;
507 |         int rc, rc2; // Spanner results for the windows at p and p+16.
508 | 
509 |         CsvCell *cell = &row_.cells[0];
510 |         row_.count = 0;
511 | 
512 |         #define PREAMBLE() \
513 |             if(p >= endp_) {\
514 |                 CSM_DEBUG("pos exceeds size"); \
515 |                 return kCsmTryParseUnderrun; \
516 |             } \
517 |             CSM_DEBUG("p = %#p; remain = %ld; next char is: %d", p, endp_-p, (int)*p) \
518 |             CSM_DEBUG("%d: distance to next newline: %d", __LINE__, strchr(p, '\n') - p);
519 | 
520 |         #define NEXT_CELL() \
521 |             ++cell; \
522 |             if(row_.count == row_.cells.size()) { \
523 |                 CSM_DEBUG("cell array overflow"); \
524 |                 return kCsmTryParseOverflow; \
525 |             }
526 | 
527 |         CSM_DEBUG("remain = %lu", endp_ - p);
528 |         CSM_DEBUG("ch = %d %c", (int) *p, *p);
529 | 
530 |     newline_skip:
531 |         /*
532 |          * Skip newlines appearing at the start of the line, which may be a
533 |          * result of DOS/MAC-formatted input. Or a double-spaced CSV file.
534 |          */
535 |         in_newline_skip = true;
536 |         PREAMBLE()
537 |         if(*p == '\r' || *p == '\n') {
538 |             ++p;
539 |             goto newline_skip;
540 |         }
541 | 
542 |     cell_start:
543 |         in_newline_skip = false;
544 |         PREAMBLE()
545 |         cell->escaped = false;
546 |         if(*p == '\r' || *p == '\n') {
547 |             /*
548 |              * A newline appearing after at least one cell has been read
549 |              * indicates the presence of a single comma demarcating an
550 |              * unquoted empty final field.
551 |              */
552 |             cell->ptr = 0;
553 |             cell->size = 0;
554 |             ++row_.count;
555 |             p_ = p + 1;
556 |             return kCsmTryParseOkay;
557 |         } else if(*p == quotechar_) {
558 |             cell_start = ++p; // Cell data begins after the opening quote.
559 |             goto in_quoted_cell;
560 |         } else {
561 |             cell_start = p;
562 |             goto in_unquoted_cell;
563 |         }
564 | 
565 |     in_quoted_cell:
566 |         PREAMBLE()
567 |         rc = quoted_cell_spanner_(p);
568 |         rc2 = quoted_cell_spanner_(p+16); // Second window is scanned unconditionally.
569 |         if(rc != 16) {
570 |             p += rc + 1; // Step past the matched quote/escape character.
571 |             goto in_escape_or_end_of_quoted_cell;
572 |         }
573 | 
574 |         switch(rc2) {
575 |             case 16:
576 |                 p += 32; // Nothing in either window: keep scanning.
577 |                 goto in_quoted_cell;
578 |             default:
579 |                 p += rc2 + 1 + 16; // Match in the second window; step past it.
580 |                 goto in_escape_or_end_of_quoted_cell;
581 |         }
582 | 
583 |     in_escape_or_end_of_quoted_cell:
584 |         PREAMBLE()
585 |         if(*p == delimiter_) {
586 |             cell->ptr = cell_start;
587 |             cell->size = p - cell_start - 1; // Exclude the closing quote before p.
588 |             ++row_.count;
589 |             NEXT_CELL();
590 |             ++p;
591 |             goto cell_start;
592 |         } else if(*p == '\r' || *p == '\n') {
593 |             cell->ptr = cell_start;
594 |             cell->size = p - cell_start - 1; // Exclude the closing quote before p.
595 |             ++row_.count;
596 |             p_ = p + 1;
597 |             return kCsmTryParseOkay;
598 |         } else {
599 |             cell->escaped = true; // The match was an escape, not the end of the cell.
600 |             ++p;
601 |             goto in_quoted_cell;
602 |         }
603 | 
604 |     in_unquoted_cell:
605 |         CSM_DEBUG("\n\nin_unquoted_cell")
606 |         PREAMBLE()
607 |         rc = unquoted_cell_spanner_(p);
608 |         CSM_DEBUG("unquoted span: %d; p[3]=%d p[..17]='%.17s'", rc, p[3], p);
609 |         rc2 = unquoted_cell_spanner_(p+16);
610 |         if(rc != 16) {
611 |             p += rc; // Stop on the matched character itself.
612 |             goto in_escape_or_end_of_unquoted_cell;
613 |         }
614 | 
615 |         switch(rc2) {
616 |         case 16:
617 |             p += 32; // Nothing in either window: keep scanning.
618 |             goto in_unquoted_cell;
619 |         default:
620 |             p += rc2 + 16; // Match in the second window; stop on it.
621 |             goto in_escape_or_end_of_unquoted_cell;
622 |         }
623 | 
624 |     in_escape_or_end_of_unquoted_cell:
625 |         PREAMBLE()
626 |         if(*p == delimiter_) {
627 |             cell->ptr = cell_start;
628 |             cell->size = p - cell_start;
629 |             ++row_.count;
630 |             CSM_DEBUG("in_escape_or_end_of_unquoted_cell(DELIMITER)")
631 |             CSM_DEBUG("p[..17] = '%.17s'", p)
632 |             CSM_DEBUG("done cell: '%.*s'", (int)cell->size, cell->ptr)
633 |             NEXT_CELL();
634 |             ++p;
635 |             goto cell_start;
636 |         } else if(*p == '\r' || *p == '\n') {
637 |             CSM_DEBUG("in_escape_or_end_of_unquoted_cell(NEWLINE)")
638 |             cell->ptr = cell_start;
639 |             cell->size = p - cell_start;
640 |             ++row_.count;
641 |             p_ = p + 1;
642 |             return kCsmTryParseOkay;
643 |         } else {
644 |             cell->escaped = true; // Neither delimiter nor newline: treat as an escape.
645 |             ++p;
646 |             goto in_unquoted_cell;
647 |         }
648 | 
649 |         CSM_DEBUG("error out");
650 |         return kCsmTryParseUnderrun;
651 |     }
652 | 
653 |     #undef PREAMBLE
654 |     #undef NEXT_CELL
655 | 
656 |     public:
657 | 
658 |     /**
659 |      * Resolve header names to cell pointers: for each pair, find the cell of
660 |      * the current row whose value equals `name` and store its address through
661 |      * `cell`. Throws csvmonkey::Error if a requested name is not in the row.
662 |      *
663 |      * @example
664 |      *      CsvCell *resource_id;
665 |      *      CsvCell *item_description;
666 |      *
667 |      *      if(! reader.read_row()) {
668 |      *          throw Error("read_row", "cannot parse header row");
669 |      *      }
670 |      *
671 |      *      reader.extract_fields({
672 |      *          {"ResourceId", &resource_id},
673 |      *          {"ItemDescription", &item_description},
674 |      *      });
675 |      */
676 |     void
677 |     extract_fields(const std::vector<FieldPair> &pairs)
678 |     {
679 |         for(auto it = pairs.begin(); it != pairs.end(); ++it) {
680 |             if(row_.by_value(it->name, *it->cell)) {
681 |                 continue;
682 |             }
683 |             throw Error("extract_fields",
684 |                 std::string("Could not find required header: ") + it->name);
685 |         }
686 |     }
687 | 
688 |     /** Grow the cell array; used at construction and after an overflow. */
689 |     void
690 |     _resize()
691 |     {
692 |         // Double the cell array, or start it at 32 cells when it is empty
693 |         // (the state at construction time).
694 |         std::vector<CsvCell> &cells = row_.cells;
695 |         const size_t doubled = cells.size() * 2;
696 |         cells.resize(doubled ? doubled : 32);
697 | 
698 |         // Stamp every cell (old and new) with the quoting configuration,
699 |         // which as_str() needs in order to decode escaped values.
700 |         for(CsvCell &cell : cells) {
701 |             cell.escapechar = escapechar_;
702 |             cell.quotechar = quotechar_;
703 |         }
704 |     }
705 | 
706 |     bool
707 |     read_row() // Parse the next row into row(); false once fill() fails with nothing to yield.
708 |     {
709 |         const char *p;
710 |         CSM_DEBUG("")
711 | 
712 |         do {
713 |             p = stream_.buf(); // Re-read each pass: fill() may move the buffer.
714 |             p_ = p;
715 |             endp_ = p + stream_.size();
716 |             switch(try_parse()) {
717 |                 case kCsmTryParseOkay:
718 |                     stream_.consume(p_ - p); // Drop the bytes the row spanned.
719 |                     return true;
720 |                 case kCsmTryParseOverflow:
721 |                     _resize(); // More cells than capacity: grow, then reparse.
722 |                     return read_row();
723 |                 case kCsmTryParseUnderrun:
724 |                     ; // Row is incomplete in the buffered data; try to fill.
725 |             }
726 |             CSM_DEBUG("attempting fill!")
727 |         } while(stream_.fill());
728 | 
729 |         if(row_.count && yield_incomplete_row_) {
730 |             CSM_DEBUG("stream fill failed, but partial row exists")
731 |             stream_.consume(endp_ - p); // Discard everything that was buffered.
732 |             return true;
733 |         }
734 | 
735 |         CSM_DEBUG("stream fill failed")
736 |         return false;
737 |     }
738 | 
739 |     CsvCursor &
740 |     row() // Cells of the most recently parsed row.
741 |     {
742 |         return row_;
743 |     }
744 | 
745 |     CsvReader(StreamCursorType &stream, // Held by reference; must outlive the reader.
746 |             char delimiter=',',
747 |             char quotechar='"',
748 |             char escapechar=0, // 0 means no escape character.
749 |             bool yield_incomplete_row=false) // See the end of read_row().
750 |         : endp_(stream.buf() + stream.size())
751 |         , p_(stream.buf())
752 |         , delimiter_(delimiter)
753 |         , quotechar_(quotechar)
754 |         , escapechar_(escapechar)
755 |         , yield_incomplete_row_(yield_incomplete_row)
756 |         , stream_(stream)
757 |         , quoted_cell_spanner_(quotechar, escapechar) // Stops at a quote or escape.
758 |         , unquoted_cell_spanner_(delimiter, '\r', '\n', escapechar) // Stops at delimiter, newline or escape.
759 |     {
760 |         _resize(); // Allocate the initial cell array.
761 |     }
762 | };
763 | 
764 | 
765 | } // namespace csvmonkey
766 | 


--------------------------------------------------------------------------------
/scripts/calc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import os.path
4 | import sys
5 | 
6 | print (os.path.getsize('ram.csv') / 1048576.0) / (float(sys.argv[1]) / 1e6), 'GiB/s'
7 | 


--------------------------------------------------------------------------------
/scripts/compare.py:
--------------------------------------------------------------------------------
 1 | 
 2 | try:
 3 |     from itertools import izip
 4 | except ImportError:
 5 |     izip = zip  # No itertools.izip available: fall back to the builtin zip.
 6 | 
 7 | import csv
 8 | import csvmonkey
 9 | 
10 | 
11 | icsv = csv.reader(open('ram.csv'))
12 | next(icsv)  # Skip the header row; csvmonkey is told header=True below.
13 | 
14 | imonkey = csvmonkey.from_path('ram.csv', yields='list', header=True)
15 | 
16 | for r1, r2 in izip(icsv, imonkey):  # Compare both parsers row by row.
17 |     print(r1)
18 |     print(r2)
19 |     assert r1 == r2
20 | 


--------------------------------------------------------------------------------
/scripts/csvcut.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import csv
 5 | import operator
 6 | import sys
 7 | from itertools import chain
 8 | 
 9 | import csvmonkey
10 | 
11 | 
12 | parser = argparse.ArgumentParser(description='Process some integers.')
13 | parser.add_argument('paths', nargs='*')
14 | parser.add_argument('-H', '--no-header', action='store_true', default=False)
15 | parser.add_argument('-f', '--fields', default='-')
16 | 
17 | args = parser.parse_args()
18 | 
19 | if args.fields == '-':
20 |     slices = [slice(None)]
21 | else:
22 |     slices = []
23 |     for bit in args.fields.split(','):
24 |         left, sep, right = bit.partition('-')
25 |         if sep:
26 |             slices.append(slice(int(left) - 1, int(right)))
27 |         else:
28 |             slices.append(slice(int(left) -1, int(left)))
29 | 
30 | ig = operator.itemgetter(*slices)
31 | 
32 | 
33 | writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
34 | 
35 | readers = [
36 |     csvmonkey.from_path(path, header=not args.no_header, yields='tuple')
37 |     for path in args.paths
38 | ]
39 | if not readers:
40 |     it = iter(sys.stdin.readline, '')
41 |     readers.append(csvmonkey.from_iter(it, header=not args.no_header, yields='tuple'))
42 | 
43 | 
44 | for reader in readers:
45 |     for row in reader:
46 |         l = []
47 |         for sl in slices:
48 |             l.extend(row[sl])
49 |         writer.writerow(l)
50 | 


--------------------------------------------------------------------------------
/scripts/dequote.py:
--------------------------------------------------------------------------------
1 | 
2 | import csv
3 | 
4 | writer = csv.writer(file('ram.noquotes-64mb.csv', 'w'), quoting=csv.QUOTE_NONE)
5 | for row in csv.reader(file('ram.64mb.csv')):
6 |     writer.writerow([c.replace(',', '') for c in row])
7 | 


--------------------------------------------------------------------------------
/scripts/makesum.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import hashlib
 3 | import csv
 4 | 
 5 | import csvmonkey
 6 | 
 7 | reader = csv.reader(open('ram.csv'))
 8 | h = hashlib.sha256()
 9 | 
10 | for row in reader:
11 |     for col in row:
12 |         h.update(col)  # NOTE(review): needs bytes on Python 3; presumably a Python 2 script - confirm.
13 | 
14 | assert h.hexdigest() == (  # Digest over every field of ram.csv, in reading order.
15 |     "68187f51a11392551209d440710d835cdc167e2150eccb34e8cf9192bb8f9fc6"
16 | )
17 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from __future__ import print_function
 3 | import os
 4 | import sys
 5 | 
 6 | from setuptools import Extension
 7 | from setuptools import setup
 8 | 
 9 | sys.path.insert(0,
10 |     os.path.join(
11 |         os.path.dirname(__file__),
12 |         'third_party',
13 |     )
14 | )
15 | 
16 | import cpuid
17 | 
18 | 
19 | def has_sse42():
20 |     cpu = cpuid.CPUID()
21 |     regs = cpu(1)
22 |     return bool((1 << 20) & regs[2])
23 | 
24 | 
25 | extra_compile_args = []
26 | extra_compile_args += ['-std=c++11']
27 | extra_compile_args += ['-Iinclude']
28 | extra_compile_args += ['-O3']
29 | extra_compile_args += ['-w']
30 | 
31 | # cc1plus: warning: command line option '-Wstrict-prototypes' is valid for
32 | # C/ObjC but not for C++
33 | extra_compile_args += ['-Wno-strict-prototypes']
34 | 
35 | #extra_compile_args += ['-DUSE_SPIRIT']
36 | #extra_compile_args += ['-I/home/dmw/src/boost_1_64_0']
37 | #extra_compile_args += ['-fprofile-generate', '-lgcov']
38 | #extra_compile_args += ['-DCSVMONKEY_DEBUG']
39 | 
40 | 
41 | if has_sse42():
42 |     extra_compile_args += ['-msse4.2']
43 | else:
44 |     print("Warning: CPU lacks SSE4.2, compiling with fallback",
45 |           file=sys.stderr)
46 | 
47 | 
# The single C++ extension module that makes up the package. NDEBUG is
# undefined so assert() stays active in the compiled module.
csvmonkey_extension = Extension(
    name='csvmonkey',
    sources=['cpython/csvmonkey.cpp'],
    undef_macros=['NDEBUG'],
    extra_compile_args=extra_compile_args,
)

setup(
    name='csvmonkey',
    version='0.0.5',
    author='David Wilson',
    author_email='dw+csvmonkey@botanicus.net',
    url='https://github.com/dw/csvmonkey/',
    classifiers=[],
    ext_modules=[csvmonkey_extension],
    zip_safe=False,
)
65 | 


--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | CMakeCache.txt
2 | CMakeFiles
3 | Makefile
4 | cmake_install.cmake
5 | main
6 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.11)
 2 | 
 3 | #SET_SOURCE_FILES_PROPERTIES( nosse_stringspanner_test.cpp PROPERTIES COMPILE_FLAGS -Wderp )
 4 | 
 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wunused -msse4.2")
 6 | 
 7 | 
 8 | add_executable(main
 9 |     main.cpp
10 |     sse42_stringspanner_test.cpp
11 |     fallback_stringspanner_test.cpp
12 | )
13 | 
14 | set_property(TARGET main PROPERTY CXX_STANDARD 11)
15 | 


--------------------------------------------------------------------------------
/tests/_stringspanner_test.cpp:
--------------------------------------------------------------------------------
 1 | #include <string>
 2 | 
 3 | #include "catch.hpp"
 4 | #include "csvmonkey.hpp"
 5 | 
 6 | 
 7 | TEST_CASE(PREFIX "initialNullTerminates", "[stringspanner]")
 8 | {
 9 |     // PCMPISTRI returns 16 to indicate null encountered.
10 |     const char *x = "\x00this,should,never,be,reached";
11 |     csvmonkey::StringSpanner ss(',');
12 |     REQUIRE(ss(x) == 16);
13 | }
14 | 
15 | 
16 | TEST_CASE(PREFIX "midNullTerminates", "[stringspanner]")
17 | {
18 |     // PCMPISTRI returns 16 to indicate null encountered.
19 |     const char *x = "derp\x00this,should,never,be,reached";
20 |     csvmonkey::StringSpanner ss(',');
21 |     REQUIRE(ss(x) == 16);
22 | }
23 | 
24 | 
25 | TEST_CASE(PREFIX "noMatchTerminates0", "[stringspanner]")
26 | {
27 |     // Comma not found.
28 |     const char *x = "derpderpderpderpderp";
29 |     csvmonkey::StringSpanner ss(',');
30 |     REQUIRE(ss(x) == 16);
31 | }
32 | 
33 | 
34 | TEST_CASE(PREFIX "noMatchTerminates1", "[stringspanner]")
35 | {
36 |     // No terminator specified.
37 |     const char *x = "derpderpderpderpderp";
38 |     csvmonkey::StringSpanner ss;
39 |     REQUIRE(ss(x) == 16);
40 | }
41 | 
42 | 
43 | TEST_CASE(PREFIX "matchAtEachOffset", "[stringspanner]")
44 | {
45 |     const char *x = "derpderpderpderpderp";
46 |     for(int i = 0; i < 16; i++) {
47 |         std::string s(x);
48 |         s[i] = ',';
49 |         INFO("i = " << i);
50 |         csvmonkey::StringSpanner ss(',');
51 |         REQUIRE(ss(s.c_str()) == i);
52 |     }
53 | }
54 | 
55 | 
56 | TEST_CASE(PREFIX "matchPos16", "[stringspanner]")
57 | {
58 |     const char *x = "derpderpderpderpderp";
59 |     std::string s(x);
60 |     s[16] = ',';
61 |     csvmonkey::StringSpanner ss(',');
62 |     REQUIRE(ss(s.c_str()) == 16);
63 | }
64 | 
65 | 
66 | TEST_CASE(PREFIX "matchPos17", "[stringspanner]")
67 | {
68 |     const char *x = "derpderpderpderpderp";
69 |     std::string s(x);
70 |     s[17] = ',';
71 |     csvmonkey::StringSpanner ss(',');
72 |     REQUIRE(ss(s.c_str()) == 16);
73 | }
74 | 


--------------------------------------------------------------------------------
/tests/bench/iteration.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/types.h>
 2 | #include <sys/stat.h>
 3 | #include <unistd.h>
 4 | #include <chrono>
 5 | 
 6 | #include "csvmonkey.hpp"
 7 | 
 8 | using csvmonkey::CsvCell;
 9 | using csvmonkey::CsvCursor;
10 | using csvmonkey::CsvReader;
11 | using csvmonkey::MappedFileCursor;
12 | using std::chrono::duration_cast;
13 | using std::chrono::high_resolution_clock;
14 | using std::chrono::microseconds;
15 | 
16 | 
// Print `msg` followed by a newline on stderr, then terminate the process
// with exit status 1.
static void
die(const char *msg)
{
    fputs(msg, stderr);
    fputc('\n', stderr);
    exit(1);
}
23 | 
24 | 
25 | static int
26 | go(const char *path)
27 | {
28 |     MappedFileCursor stream;
29 |     CsvReader<MappedFileCursor> reader(stream);
30 | 
31 |     stream.open(path);
32 |     CsvCursor &row = reader.row();
33 |     if(! reader.read_row()) {
34 |         die("Cannot read header row");
35 |     }
36 | 
37 |     CsvCell *cost_cell;
38 |     if((! row.by_value("Cost", cost_cell)) &&
39 |        (! row.by_value("UnBlendedCost", cost_cell))) {
40 |         die("Cannot find Cost column");
41 |     }
42 | 
43 |     CsvCell *resource_id_cell;
44 |     if(! row.by_value("ResourceId", resource_id_cell)) {
45 |         die("Cannot find ResourceId column");
46 |     }
47 | 
48 |     CsvCell *record_type_cell;
49 |     if(! row.by_value("RecordType", record_type_cell)) {
50 |         die("Cannot find RecordType column");
51 |     }
52 | 
53 |     auto now = [&] { return high_resolution_clock::now(); };
54 |     double total = 0.0;
55 |     auto start = now();
56 | 
57 |     while(reader.read_row()) {
58 |         if(0) {
59 |             if(record_type_cell->equals("LineItem")) {
60 |                 total += cost_cell->as_double();
61 |             } else if(record_type_cell->equals("Rounding")) {
62 |                 total += cost_cell->as_double();
63 |             }
64 |         }
65 |     }
66 |     auto finish = now();
67 | 
68 |     printf("Total cost: %lf\n", total);
69 |     auto usec = duration_cast<microseconds>(finish - start).count();
70 | 
71 |     struct stat st;
72 |     stat(path, &st);
73 | 
74 |     std::cout << usec << " us\n";
75 |     std::cout << (st.st_size / usec) << " bytes/us\n";
76 |     std::cout << (
77 |         (1e6 / (1024.0 * 1048576.0)) * (double) (st.st_size / usec) 
78 |     ) << " GiB/s\n";
79 |     return 0;
80 | }
81 | 
82 | 
83 | int main(int argc, char **argv)
84 | {
85 |     const char *path = "ram.csv";
86 |     if(argc > 1) {
87 |         path = argv[1];
88 |     }
89 |     for(int i = 0 ; i < 5; i++) {
90 |         go(path);
91 |     }
92 | }
93 | 


--------------------------------------------------------------------------------
/tests/csvmonkey_test.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import unittest
 3 | 
 4 | import csvmonkey
 5 | 
 6 | 
 7 | EXAMPLE_FILE = """
 8 | c0,c1,c2,c3
 9 | 0,1,2,3
10 | a,b,c,d
11 | """
12 | 
13 | 
def make_reader(s, **kwargs):
    """Return a csvmonkey reader fed by an iterator whose only chunk is `s`.

    Keyword arguments are passed through to ``csvmonkey.from_iter``.
    """
    chunks = iter([s])
    return csvmonkey.from_iter(chunks, **kwargs)
16 | 
17 | 
class ParseTest(unittest.TestCase):
    """Field-splitting regression tests."""

    def test_bad_split(self):
        # A space-delimited record must split into the same fields that
        # str.split() yields.
        s = "2017-05-01T02:15:08.000Z 2 229340663981 eni-00589050 172.31.11.238 138.246.253.19 443 54503 6 1 44 1493604908 1493604966 ACCEPT OK\n"
        reader = make_reader(s, delimiter=' ', header=False)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(next(reader).astuple(), tuple(s.split()))
23 | 
24 | 
25 | 
class RowTest(unittest.TestCase):
    """Row access by positive index, negative index and column name.

    Uses assertEqual rather than the deprecated assertEquals alias.
    """

    def reader(self):
        """Return a fresh reader over EXAMPLE_FILE."""
        return make_reader(EXAMPLE_FILE)

    def test_getitem_numeric_positive(self):
        reader = self.reader()
        row = next(reader)
        self.assertEqual("0", row[0])
        self.assertRaises(IndexError, lambda: row[5])

    def test_getitem_numeric_negative(self):
        reader = self.reader()
        row = next(reader)
        self.assertEqual("3", row[-1])
        self.assertRaises(IndexError, lambda: row[-5])

    def test_getitem_key(self):
        reader = self.reader()
        row = next(reader)
        self.assertEqual("0", row["c0"])
        self.assertRaises(KeyError, lambda: row["missing"])
47 | 
48 | 
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     unittest.main()
53 | 


--------------------------------------------------------------------------------
/tests/data/anon-ram.csv.zstd:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8f94ae436de5dc0a8033f7fdd2c7348e538546a6799a20ea6a62705d220f2687
3 | size 2117190
4 | 


--------------------------------------------------------------------------------
/tests/fallback_stringspanner_test.cpp:
--------------------------------------------------------------------------------
1 | #define CSM_IGNORE_SSE42
2 | #define PREFIX "fallback_stringspanner_"
3 | #include "_stringspanner_test.cpp"
4 | 


--------------------------------------------------------------------------------
/tests/fullsum.cpp:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | 
 3 | #include "csvmonkey.hpp"
 4 | #include "picosha2.h"
 5 | 
 6 | 
 7 | using namespace csvmonkey;
 8 | 
 9 | 
10 | int main(int argc, char **argv)
11 | {
12 |     const char *path = "ram.csv";
13 |     if(argc > 1) {
14 |         path = argv[1];
15 |     }
16 | 
17 |     MappedFileCursor stream;
18 |     stream.open(path);
19 |     CsvReader<MappedFileCursor> reader(stream);
20 |     CsvCursor &row = reader.row();
21 | 
22 |     picosha2::hash256_one_by_one hasher;
23 |     while(reader.read_row()) {
24 |         for(size_t i = 0; i < row.count; i++) {
25 |             CsvCell &cell = row.cells[i];
26 |             std::string s = cell.as_str();
27 |             hasher.process(s.begin(), s.end());
28 |         }
29 |     }
30 | 
31 |     hasher.finish();
32 |     std::cout << picosha2::get_hash_hex_string(hasher) << "\n";
33 |     return 0;
34 | }
35 | 


--------------------------------------------------------------------------------
/tests/main.cpp:
--------------------------------------------------------------------------------
1 | #define CATCH_CONFIG_MAIN
2 | #include "catch.hpp"
3 | 


--------------------------------------------------------------------------------
/tests/parser_test.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import unittest
 3 | import io
 4 | 
 5 | import csvmonkey
 6 | 
 7 | 
 8 | def parse(s):
 9 |     return list(
10 |         csvmonkey.from_file(
11 |             io.BytesIO(s),
12 |             header=False,
13 |             yields='tuple'
14 |         )
15 |     )
16 | 
17 | 
class BoundaryTest(unittest.TestCase):
    """Records whose end falls on a page or 16-byte boundary.

    Uses assertEqual rather than the deprecated assertEquals alias.
    """

    def test_4094(self):
        # parsing ends on page boundary
        c = 'x' * 4094
        s = '%s,\n' % (c,)
        self.assertEqual([(c,'')], parse(s))

    def test_4095(self):
        # parsing ends on first byte of new page
        c = 'x' * 4095
        s = '%s,\n' % (c,)
        self.assertEqual([(c,'')], parse(s))

    def test_14(self):
        # parsing ends on 16th byte (SSE4.2)
        c = 'x' * 14
        s = '%s,\n' % (c,)
        self.assertEqual([(c,'')], parse(s))

    def test_15(self):
        # parsing ends on 17th byte (SSE4.2)
        c = 'x' * 15
        s = '%s,\n' % (c,)
        self.assertEqual([(c,'')], parse(s))
42 | 
43 | 
class Test(unittest.TestCase):
    """Basic parser behaviour: empty input, line endings, quoting.

    Uses assertEqual rather than the deprecated assertEquals alias.
    """

    def test_empty0(self):
        self.assertEqual([], parse(''))

    def test_empty1(self):
        self.assertEqual([], parse('\n'))

    def test_empty2(self):
        self.assertEqual([], parse('\r\n'))

    # Previously also named test_empty2, which replaced the method above in
    # the class body so the plain '\r\n' case never ran.
    def test_empty3(self):
        self.assertEqual([], parse('\r\n\n\r\r\r\n'))

    def test_unquoted_noeol(self):
        self.assertEqual([('a', 'b')], parse('a,b'))

    def test_unquoted_noeol2(self):
        self.skipTest('failing')
        self.assertEqual([('a', 'b'), ('c', 'd')], parse('a,b\n\rc,d'))

    def test_unquoted(self):
        self.assertEqual([('a', 'b')], parse('a,b\n'))

    def test_quoted_empty(self):
        self.assertEqual([('',)], parse('""\n'))

    def test_quoted_empty_unquoted(self):
        self.skipTest('failing')
        self.assertEqual([('', '')], parse('"",\n'))

    def test_unquoted_empty(self):
        self.skipTest('failing')
        self.assertEqual([('', '')], parse(',\n'))
77 | 
78 | if __name__ == '__main__':
79 |     unittest.main()
80 | 


--------------------------------------------------------------------------------
/tests/sse42_stringspanner_test.cpp:
--------------------------------------------------------------------------------
1 | #define PREFIX "sse42_stringspanner_"
2 | #include "_stringspanner_test.cpp"
3 | 


--------------------------------------------------------------------------------
/third_party/cpuid.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | #     Copyright (c) 2014 Anders Høst
  4 | #
  5 | 
  6 | from __future__ import print_function
  7 | 
  8 | import platform
  9 | import os
 10 | import sys
 11 | import ctypes
 12 | from ctypes import c_uint32, c_int, c_size_t, c_void_p, POINTER, CFUNCTYPE
 13 | 
 14 | # Posix x86_64:
 15 | # Two first call registers : RDI, RSI
 16 | # Volatile registers       : RAX, RCX, RDX, RSI, RDI, R8-11
 17 | 
 18 | # Windows x86_64:
 19 | # Two first call registers : RCX, RDX
 20 | # Volatile registers       : RAX, RCX, RDX, R8-11
 21 | 
 22 | # cdecl 32 bit:
 23 | # Two first call registers : Stack (%esp)
 24 | # Volatile registers       : EAX, ECX, EDX
 25 | 
# Each table below is the raw machine code of one small routine taking two
# arguments: a pointer to a 16-byte result buffer and the CPUID leaf number.
# The routine loads the leaf into EAX, zeroes ECX, executes CPUID and stores
# EAX, EBX, ECX and EDX at offsets 0, 4, 8 and 12 of the buffer. The tables
# differ only in where each calling convention passes the two arguments.

# POSIX x86_64: buffer pointer arrives in RDI, leaf in RSI.
_POSIX_64_OPC = [
        0x53,                    # push   %rbx
        0x48, 0x89, 0xf0,        # mov    %rsi,%rax
        0x31, 0xc9,              # xor    %ecx,%ecx
        0x0f, 0xa2,              # cpuid
        0x89, 0x07,              # mov    %eax,(%rdi)
        0x89, 0x5f, 0x04,        # mov    %ebx,0x4(%rdi)
        0x89, 0x4f, 0x08,        # mov    %ecx,0x8(%rdi)
        0x89, 0x57, 0x0c,        # mov    %edx,0xc(%rdi)
        0x5b,                    # pop    %rbx
        0xc3                     # retq
]

# Windows x86_64: buffer pointer arrives in RCX, leaf in RDX. The pointer is
# moved to R8 first because ECX is cleared before CPUID runs.
_WINDOWS_64_OPC = [
        0x53,                    # push   %rbx
        0x48, 0x89, 0xd0,        # mov    %rdx,%rax
        0x49, 0x89, 0xc8,        # mov    %rcx, %r8
        0x31, 0xc9,              # xor    %ecx,%ecx
        0x0f, 0xa2,              # cpuid
        0x41, 0x89, 0x00,        # mov    %eax,(%r8)
        0x41, 0x89, 0x58, 0x04,  # mov    %ebx,0x4(%r8)
        0x41, 0x89, 0x48, 0x08,  # mov    %ecx,0x8(%r8)
        0x41, 0x89, 0x50, 0x0c,  # mov    %edx,0xc(%r8)
        0x5b,                    # pop    %rbx
        0xc3                     # retq
]

# 32-bit cdecl: both arguments are read from the stack. The offsets are 0xc
# and 0x10 because EBX and EDI have already been pushed.
_CDECL_32_OPC = [
        0x53,                    # push   %ebx
        0x57,                    # push   %edi
        0x8b, 0x7c, 0x24, 0x0c,  # mov    0xc(%esp),%edi
        0x8b, 0x44, 0x24, 0x10,  # mov    0x10(%esp),%eax
        0x31, 0xc9,              # xor    %ecx,%ecx
        0x0f, 0xa2,              # cpuid
        0x89, 0x07,              # mov    %eax,(%edi)
        0x89, 0x5f, 0x04,        # mov    %ebx,0x4(%edi)
        0x89, 0x4f, 0x08,        # mov    %ecx,0x8(%edi)
        0x89, 0x57, 0x0c,        # mov    %edx,0xc(%edi)
        0x5f,                    # pop    %edi
        0x5b,                    # pop    %ebx
        0xc3                     # ret
]
 68 | 
 69 | is_windows = os.name == "nt" or sys.platform == "cygwin"
 70 | is_64bit   = ctypes.sizeof(ctypes.c_voidp) == 8
 71 | 
 72 | class CPUID_struct(ctypes.Structure):
 73 |     _fields_ = [(r, c_uint32) for r in ("eax", "ebx", "ecx", "edx")]
 74 | 
 75 | class CPUID(object):
 76 |     def __init__(self):
 77 |         if platform.machine() not in ("AMD64", "x86_64", "x86", "i686"):
 78 |             raise SystemError("Only available for x86")
 79 |         
 80 |         if is_windows:
 81 |             if is_64bit:
 82 |                 # VirtualAlloc seems to fail under some weird
 83 |                 # circumstances when ctypes.windll.kernel32 is
 84 |                 # used under 64 bit Python. CDLL fixes this.
 85 |                 self.win = ctypes.CDLL("kernel32.dll")
 86 |                 opc = _WINDOWS_64_OPC
 87 |             else:
 88 |                 # Here ctypes.windll.kernel32 is needed to get the
 89 |                 # right DLL. Otherwise it will fail when running
 90 |                 # 32 bit Python on 64 bit Windows.
 91 |                 self.win = ctypes.windll.kernel32
 92 |                 opc = _CDECL_32_OPC
 93 |         else:
 94 |             opc = _POSIX_64_OPC if is_64bit else _CDECL_32_OPC
 95 | 
 96 |         size = len(opc)
 97 |         code = (ctypes.c_ubyte * size)(*opc)
 98 | 
 99 |         self.r = CPUID_struct()
100 | 
101 |         if is_windows:
102 |             self.addr = self.win.VirtualAlloc(None, size, 0x1000, 0x40)
103 |             if not self.addr:
104 |                 raise MemoryError("Could not allocate RWX memory")
105 |         else:
106 |             self.libc = ctypes.cdll.LoadLibrary(None)
107 |             self.libc.valloc.restype = ctypes.c_void_p
108 |             self.libc.valloc.argtypes = [ctypes.c_size_t]
109 |             self.addr = self.libc.valloc(size)
110 |             if not self.addr:
111 |                 raise MemoryError("Could not allocate memory")
112 | 
113 |             self.libc.mprotect.restype = c_int
114 |             self.libc.mprotect.argtypes = [c_void_p, c_size_t, c_int]
115 |             ret = self.libc.mprotect(self.addr, size, 1 | 2 | 4)
116 |             if ret != 0:
117 |                 raise OSError("Failed to set RWX")
118 | 
119 | 
120 |         ctypes.memmove(self.addr, code, size)
121 | 
122 |         func_type = CFUNCTYPE(None, POINTER(CPUID_struct), c_uint32)
123 |         self.func_ptr = func_type(self.addr)
124 | 
125 |     def __call__(self, eax):
126 |         self.func_ptr(self.r, eax)
127 |         return (self.r.eax, self.r.ebx, self.r.ecx, self.r.edx)
128 | 
129 |     def __del__(self):
130 |         if is_windows:
131 |             self.win.VirtualFree(self.addr, 0, 0x8000)
132 |         elif self.libc:
133 |             # Seems to throw exception when the program ends and
134 |             # libc is cleaned up before the object?
135 |             self.libc.free.restype = None
136 |             self.libc.free.argtypes = [c_void_p]
137 |             self.libc.free(self.addr)
138 | 
if __name__ == "__main__":
    def valid_inputs():
        """Yield (leaf, registers) for each leaf from both range bases.

        For each base (0x0 and 0x80000000), the EAX result of querying the
        base gives the highest leaf to visit in that range.
        """
        cpuid = CPUID()
        for base in (0x0, 0x80000000):
            highest = cpuid(base)[0]
            leaf = base
            while leaf <= highest:
                yield (leaf, cpuid(leaf))
                leaf += 1

    # Header row, then one line of hex register values per leaf.
    header = ("CPUID", "A", "B", "C", "D")
    print(" ".join(title.ljust(8) for title in header).strip())
    for eax, regs in valid_inputs():
        print("%08x" % eax, " ".join("%08x" % reg for reg in regs))
152 | 
153 | 


--------------------------------------------------------------------------------
/third_party/picosha2.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (C) 2017 okdshin
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | #ifndef PICOSHA2_H
 25 | #define PICOSHA2_H
 26 | // picosha2:20140213
 27 | 
 28 | #ifndef PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR
 29 | #define PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR \
 30 |     1048576  //=1024*1024: default is 1MB memory
 31 | #endif
 32 | 
 33 | #include <algorithm>
 34 | #include <cassert>
 35 | #include <iterator>
 36 | #include <sstream>
 37 | #include <vector>
 38 | #include <fstream>
 39 | namespace picosha2 {
 40 | typedef unsigned long word_t;
 41 | typedef unsigned char byte_t;
 42 | 
 43 | static const size_t k_digest_size = 32;
 44 | 
 45 | namespace detail {
 46 | inline byte_t mask_8bit(byte_t x) { return x & 0xff; }
 47 | 
 48 | inline word_t mask_32bit(word_t x) { return x & 0xffffffff; }
 49 | 
 50 | const word_t add_constant[64] = {
 51 |     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
 52 |     0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 53 |     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
 54 |     0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 55 |     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
 56 |     0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 57 |     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
 58 |     0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 59 |     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
 60 |     0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 61 |     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
 62 | 
 63 | const word_t initial_message_digest[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372,
 64 |                                           0xa54ff53a, 0x510e527f, 0x9b05688c,
 65 |                                           0x1f83d9ab, 0x5be0cd19};
 66 | 
 67 | inline word_t ch(word_t x, word_t y, word_t z) { return (x & y) ^ ((~x) & z); }
 68 | 
 69 | inline word_t maj(word_t x, word_t y, word_t z) {
 70 |     return (x & y) ^ (x & z) ^ (y & z);
 71 | }
 72 | 
 73 | inline word_t rotr(word_t x, std::size_t n) {
 74 |     assert(n < 32);
 75 |     return mask_32bit((x >> n) | (x << (32 - n)));
 76 | }
 77 | 
 78 | inline word_t bsig0(word_t x) { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22); }
 79 | 
 80 | inline word_t bsig1(word_t x) { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25); }
 81 | 
 82 | inline word_t shr(word_t x, std::size_t n) {
 83 |     assert(n < 32);
 84 |     return x >> n;
 85 | }
 86 | 
 87 | inline word_t ssig0(word_t x) { return rotr(x, 7) ^ rotr(x, 18) ^ shr(x, 3); }
 88 | 
 89 | inline word_t ssig1(word_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ shr(x, 10); }
 90 | 
 91 | template <typename RaIter1, typename RaIter2>
 92 | void hash256_block(RaIter1 message_digest, RaIter2 first, RaIter2 last) {
 93 |     assert(first + 64 == last);
 94 |     static_cast<void>(last);  // for avoiding unused-variable warning
 95 |     word_t w[64];
 96 |     std::fill(w, w + 64, 0);
 97 |     for (std::size_t i = 0; i < 16; ++i) {
 98 |         w[i] = (static_cast<word_t>(mask_8bit(*(first + i * 4))) << 24) |
 99 |                (static_cast<word_t>(mask_8bit(*(first + i * 4 + 1))) << 16) |
100 |                (static_cast<word_t>(mask_8bit(*(first + i * 4 + 2))) << 8) |
101 |                (static_cast<word_t>(mask_8bit(*(first + i * 4 + 3))));
102 |     }
103 |     for (std::size_t i = 16; i < 64; ++i) {
104 |         w[i] = mask_32bit(ssig1(w[i - 2]) + w[i - 7] + ssig0(w[i - 15]) +
105 |                           w[i - 16]);
106 |     }
107 | 
108 |     word_t a = *message_digest;
109 |     word_t b = *(message_digest + 1);
110 |     word_t c = *(message_digest + 2);
111 |     word_t d = *(message_digest + 3);
112 |     word_t e = *(message_digest + 4);
113 |     word_t f = *(message_digest + 5);
114 |     word_t g = *(message_digest + 6);
115 |     word_t h = *(message_digest + 7);
116 | 
117 |     for (std::size_t i = 0; i < 64; ++i) {
118 |         word_t temp1 = h + bsig1(e) + ch(e, f, g) + add_constant[i] + w[i];
119 |         word_t temp2 = bsig0(a) + maj(a, b, c);
120 |         h = g;
121 |         g = f;
122 |         f = e;
123 |         e = mask_32bit(d + temp1);
124 |         d = c;
125 |         c = b;
126 |         b = a;
127 |         a = mask_32bit(temp1 + temp2);
128 |     }
129 |     *message_digest += a;
130 |     *(message_digest + 1) += b;
131 |     *(message_digest + 2) += c;
132 |     *(message_digest + 3) += d;
133 |     *(message_digest + 4) += e;
134 |     *(message_digest + 5) += f;
135 |     *(message_digest + 6) += g;
136 |     *(message_digest + 7) += h;
137 |     for (std::size_t i = 0; i < 8; ++i) {
138 |         *(message_digest + i) = mask_32bit(*(message_digest + i));
139 |     }
140 | }
141 | 
142 | }  // namespace detail
143 | 
// Write each element of [first, last) to `os` as a zero-padded, two-digit
// hexadecimal number.
//
// The stream's format flags and fill character are restored before
// returning, so output written by the caller afterwards is unaffected.
// Previously the fill character stayed '0' and the base was forced to
// decimal.
template <typename InIter>
void output_hex(InIter first, InIter last, std::ostream& os) {
    const std::ios_base::fmtflags saved_flags = os.flags();
    // fill() installs the new fill character and returns the previous one.
    const std::ostream::char_type saved_fill = os.fill('0');
    os.setf(std::ios::hex, std::ios::basefield);
    for (; first != last; ++first) {
        os.width(2);  // width resets after every insertion
        os << static_cast<unsigned int>(*first);
    }
    os.fill(saved_fill);
    os.flags(saved_flags);
}
155 | 
// Format [first, last) as hexadecimal text and store it in hex_str.
template <typename InIter>
void bytes_to_hex_string(InIter first, InIter last, std::string& hex_str) {
    std::ostringstream hex_stream;
    output_hex(first, last, hex_stream);
    hex_str = hex_stream.str();
}

// Container overload: formats every element of `bytes` into hex_str.
template <typename InContainer>
void bytes_to_hex_string(const InContainer& bytes, std::string& hex_str) {
    bytes_to_hex_string(bytes.begin(), bytes.end(), hex_str);
}

// Format [first, last) as hexadecimal text and return it.
template <typename InIter>
std::string bytes_to_hex_string(InIter first, InIter last) {
    std::string result;
    bytes_to_hex_string(first, last, result);
    return result;
}

// Container overload: returns every element of `bytes` as hexadecimal text.
template <typename InContainer>
std::string bytes_to_hex_string(const InContainer& bytes) {
    std::string result;
    bytes_to_hex_string(bytes, result);
    return result;
}
181 | 
182 | class hash256_one_by_one {
183 |    public:
184 |     hash256_one_by_one() { init(); }
185 | 
186 |     void init() {
187 |         buffer_.clear();
188 |         std::fill(data_length_digits_, data_length_digits_ + 4, 0);
189 |         std::copy(detail::initial_message_digest,
190 |                   detail::initial_message_digest + 8, h_);
191 |     }
192 | 
193 |     template <typename RaIter>
194 |     void process(RaIter first, RaIter last) {
195 |         add_to_data_length(static_cast<word_t>(std::distance(first, last)));
196 |         std::copy(first, last, std::back_inserter(buffer_));
197 |         std::size_t i = 0;
198 |         for (; i + 64 <= buffer_.size(); i += 64) {
199 |             detail::hash256_block(h_, buffer_.begin() + i,
200 |                                   buffer_.begin() + i + 64);
201 |         }
202 |         buffer_.erase(buffer_.begin(), buffer_.begin() + i);
203 |     }
204 | 
205 |     void finish() {
206 |         byte_t temp[64];
207 |         std::fill(temp, temp + 64, 0);
208 |         std::size_t remains = buffer_.size();
209 |         std::copy(buffer_.begin(), buffer_.end(), temp);
210 |         temp[remains] = 0x80;
211 | 
212 |         if (remains > 55) {
213 |             std::fill(temp + remains + 1, temp + 64, 0);
214 |             detail::hash256_block(h_, temp, temp + 64);
215 |             std::fill(temp, temp + 64 - 4, 0);
216 |         } else {
217 |             std::fill(temp + remains + 1, temp + 64 - 4, 0);
218 |         }
219 | 
220 |         write_data_bit_length(&(temp[56]));
221 |         detail::hash256_block(h_, temp, temp + 64);
222 |     }
223 | 
224 |     template <typename OutIter>
225 |     void get_hash_bytes(OutIter first, OutIter last) const {
226 |         for (const word_t* iter = h_; iter != h_ + 8; ++iter) {
227 |             for (std::size_t i = 0; i < 4 && first != last; ++i) {
228 |                 *(first++) = detail::mask_8bit(
229 |                     static_cast<byte_t>((*iter >> (24 - 8 * i))));
230 |             }
231 |         }
232 |     }
233 | 
234 |    private:
235 |     void add_to_data_length(word_t n) {
236 |         word_t carry = 0;
237 |         data_length_digits_[0] += n;
238 |         for (std::size_t i = 0; i < 4; ++i) {
239 |             data_length_digits_[i] += carry;
240 |             if (data_length_digits_[i] >= 65536u) {
241 |                 carry = data_length_digits_[i] >> 16;
242 |                 data_length_digits_[i] &= 65535u;
243 |             } else {
244 |                 break;
245 |             }
246 |         }
247 |     }
248 |     void write_data_bit_length(byte_t* begin) {
249 |         word_t data_bit_length_digits[4];
250 |         std::copy(data_length_digits_, data_length_digits_ + 4,
251 |                   data_bit_length_digits);
252 | 
253 |         // convert byte length to bit length (multiply 8 or shift 3 times left)
254 |         word_t carry = 0;
255 |         for (std::size_t i = 0; i < 4; ++i) {
256 |             word_t before_val = data_bit_length_digits[i];
257 |             data_bit_length_digits[i] <<= 3;
258 |             data_bit_length_digits[i] |= carry;
259 |             data_bit_length_digits[i] &= 65535u;
260 |             carry = (before_val >> (16 - 3)) & 65535u;
261 |         }
262 | 
263 |         // write data_bit_length
264 |         for (int i = 3; i >= 0; --i) {
265 |             (*begin++) = static_cast<byte_t>(data_bit_length_digits[i] >> 8);
266 |             (*begin++) = static_cast<byte_t>(data_bit_length_digits[i]);
267 |         }
268 |     }
269 |     std::vector<byte_t> buffer_;
270 |     word_t data_length_digits_[4];  // as 64bit integer (16bit x 4 integer)
271 |     word_t h_[8];
272 | };
273 | 
274 | inline void get_hash_hex_string(const hash256_one_by_one& hasher,
275 |                                 std::string& hex_str) {
276 |     byte_t hash[k_digest_size];
277 |     hasher.get_hash_bytes(hash, hash + k_digest_size);
278 |     return bytes_to_hex_string(hash, hash + k_digest_size, hex_str);
279 | }
280 | 
281 | inline std::string get_hash_hex_string(const hash256_one_by_one& hasher) {
282 |     std::string hex_str;
283 |     get_hash_hex_string(hasher, hex_str);
284 |     return hex_str;
285 | }
286 | 
287 | namespace impl {
288 | template <typename RaIter, typename OutIter>
289 | void hash256_impl(RaIter first, RaIter last, OutIter first2, OutIter last2, int,
290 |                   std::random_access_iterator_tag) {
291 |     hash256_one_by_one hasher;
292 |     // hasher.init();
293 |     hasher.process(first, last);
294 |     hasher.finish();
295 |     hasher.get_hash_bytes(first2, last2);
296 | }
297 | 
298 | template <typename InputIter, typename OutIter>
299 | void hash256_impl(InputIter first, InputIter last, OutIter first2,
300 |                   OutIter last2, int buffer_size, std::input_iterator_tag) {
301 |     std::vector<byte_t> buffer(buffer_size);
302 |     hash256_one_by_one hasher;
303 |     // hasher.init();
304 |     while (first != last) {
305 |         int size = buffer_size;
306 |         for (int i = 0; i != buffer_size; ++i, ++first) {
307 |             if (first == last) {
308 |                 size = i;
309 |                 break;
310 |             }
311 |             buffer[i] = *first;
312 |         }
313 |         hasher.process(buffer.begin(), buffer.begin() + size);
314 |     }
315 |     hasher.finish();
316 |     hasher.get_hash_bytes(first2, last2);
317 | }
318 | }
319 | 
320 | template <typename InIter, typename OutIter>
321 | void hash256(InIter first, InIter last, OutIter first2, OutIter last2,
322 |              int buffer_size = PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR) {
323 |     picosha2::impl::hash256_impl(
324 |         first, last, first2, last2, buffer_size,
325 |         typename std::iterator_traits<InIter>::iterator_category());
326 | }
327 | 
// Convenience overload: hashes the bytes in [first, last) and writes the
// digest to the range spanned by dst.begin() and dst.end().
template <typename InIter, typename OutContainer>
void hash256(InIter first, InIter last, OutContainer& dst) {
    hash256(first, last,
            dst.begin(), dst.end());
}
332 | 
// Convenience overload: hashes every element of `src` (from src.begin() to
// src.end()) and writes the digest to [first, last).
template <typename InContainer, typename OutIter>
void hash256(const InContainer& src, OutIter first, OutIter last) {
    hash256(src.begin(), src.end(),
            first, last);
}
337 | 
// Convenience overload: hashes every element of `src` and writes the digest
// to the range spanned by dst.begin() and dst.end().
template <typename InContainer, typename OutContainer>
void hash256(const InContainer& src, OutContainer& dst) {
    hash256(src.begin(), src.end(),
            dst.begin(), dst.end());
}
342 | 
343 | template <typename InIter>
344 | void hash256_hex_string(InIter first, InIter last, std::string& hex_str) {
345 |     byte_t hashed[k_digest_size];
346 |     hash256(first, last, hashed, hashed + k_digest_size);
347 |     std::ostringstream oss;
348 |     output_hex(hashed, hashed + k_digest_size, oss);
349 |     hex_str.assign(oss.str());
350 | }
351 | 
// Value-returning form: hashes the bytes in [first, last) and returns the
// digest as a hexadecimal string.
template <typename InIter>
std::string hash256_hex_string(InIter first, InIter last) {
    std::string result;
    hash256_hex_string(first, last, result);
    return result;
}
358 | 
359 | inline void hash256_hex_string(const std::string& src, std::string& hex_str) {
360 |     hash256_hex_string(src.begin(), src.end(), hex_str);
361 | }
362 | 
// Container overload: hashes every element of `src` (from src.begin() to
// src.end()) and stores the hexadecimal digest in `hex_str`.
template <typename InContainer>
void hash256_hex_string(const InContainer& src, std::string& hex_str) {
    hash256_hex_string(src.begin(), src.end(),
                       hex_str);
}
367 | 
// Container overload, value-returning form: hashes every element of `src`
// and returns the digest as a hexadecimal string.
template <typename InContainer>
std::string hash256_hex_string(const InContainer& src) {
    return hash256_hex_string(src.begin(),
                              src.end());
}
// File-stream overload: hashes the characters read from `f`, starting at its
// current position and continuing until the stream buffer is exhausted, and
// writes the digest to [first, last).
//
// The stream is read through std::istreambuf_iterator<char>, i.e. unformatted
// character by character. No stream-state checks are performed here.
template <typename OutIter>
void hash256(std::ifstream& f, OutIter first, OutIter last) {
    std::istreambuf_iterator<char> stream_pos(f);
    std::istreambuf_iterator<char> stream_end;
    hash256(stream_pos, stream_end, first, last);
}
376 | }// namespace picosha2
377 | #endif  // PICOSHA2_H
378 | 


--------------------------------------------------------------------------------