├── .gitattributes ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── cpython ├── csvmonkey.cpp ├── file_stream_cursor.hpp └── iterator_stream_cursor.hpp ├── docs ├── Makefile ├── conf.py ├── cpp.rst ├── index.rst ├── python.rst ├── static │ └── .empty └── templates │ ├── github.html │ ├── globaltoc.html │ ├── layout.html │ └── piwik-config.js ├── include └── csvmonkey.hpp ├── scripts ├── calc.py ├── compare.py ├── csvcut.py ├── dequote.py └── makesum.py ├── setup.py ├── tests ├── .gitignore ├── CMakeLists.txt ├── _stringspanner_test.cpp ├── bench │ └── iteration.cpp ├── catch.hpp ├── csvmonkey_test.py ├── data │ ├── anon-ram.csv.zstd │ └── profiledata.csv ├── fallback_stringspanner_test.cpp ├── fullsum.cpp ├── main.cpp ├── parser_test.py └── sse42_stringspanner_test.cpp └── third_party ├── cpuid.py └── picosha2.h /.gitattributes: -------------------------------------------------------------------------------- 1 | **.zstd filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ram.* 2 | perf.data* 3 | **.*.sw[op] 4 | tests/bench/iteration 5 | *.pyc 6 | build 7 | **.so 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017, David Wilson 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 
12 | 13 | 3. Neither the name of the copyright holder nor the names of its contributors 14 | may be used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft third_party 2 | graft cpython 3 | graft include 4 | include LICENSE 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | CXXFLAGS += -Iinclude 3 | CXXFLAGS += -Ithird_party 4 | 5 | CXXFLAGS += -std=c++11 6 | CXXFLAGS += -O3 7 | CXXFLAGS += -Wall 8 | CXXFLAGS += -lc 9 | #CXXFLAGS += -DUSE_SPIRIT 10 | 11 | default: debug python 12 | 13 | python: 14 | rm -rf build 15 | python setup.py build_ext --inplace 16 | 17 | debug: CXXFLAGS+=-O0 -g 18 | debug: tests/bench/iteration 19 | 20 | release: X=-DNDEBUG 21 | release: tests/bench/iteration tests/fullsum 22 | 23 | tests/bench/iteration: tests/bench/iteration.cpp include/csvmonkey.hpp Makefile 24 | g++ -std=c++11 $(CXXFLAGS) -msse4.2 $(X) -g -o tests/bench/iteration tests/bench/iteration.cpp 25 | 26 | tests/fullsum: tests/fullsum.cpp include/csvmonkey.hpp Makefile 27 | g++ -std=c++11 $(CXXFLAGS) -msse4.2 $(X) -g -o tests/fullsum tests/fullsum.cpp 28 | 29 | clean: 30 | rm -f tests/fullsum tests/bench/iteration cachegrind* perf.data* *.gcda 31 | 32 | pgo: X+=-DNDEBUG 33 | pgo: 34 | g++ -std=c++11 $(CXXFLAGS) -DNDEBUG -fprofile-generate -msse4.2 $(X) -g -o tests/bench/iteration tests/bench/iteration.cpp 35 | ./tests/bench/iteration tests/data/profiledata.csv 36 | g++ -std=c++11 $(CXXFLAGS) -DNDEBUG -fprofile-use -msse4.2 $(X) -g -o tests/bench/iteration tests/bench/iteration.cpp 37 | 38 | grind: 39 | rm -f cachegrind.out.* 40 | valgrind --tool=cachegrind --branch-sim=yes ./tests/bench/iteration ram.64mb.csv 41 | cg_annotate --auto=yes cachegrind.out.* 42 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # csvmonkey 2 | 3 | This is a header-only vectorized, lazy-decoding, zero-copy CSV file parser. 4 | Given appropriate input data and hardware, the C++ version can tokenize ~1.9 5 | GiB/sec of input in one thread. For a basic summing task, the Python version is 6 | ~5x faster than `csv.reader` and ~10x faster than `csv.DictReader`, while 7 | maintaining a similarly usable interface. 8 | 9 | **This still requires a ton of work. For now it's mostly toy code.** 10 | 11 | Requires a CPU supporting Intel SSE4.2 and a C++11 compiler that bundles 12 | `smmintrin.h`. For non-SSE4.2 machines, a reasonable fallback implementation is 13 | also provided. 14 | 15 | As of writing, csvmonkey comfortably leads Ewan Higgs's csv-game 17 | microbenchmark of 24 CSV parsers. 18 | 19 | 20 | ## How It Works 21 | 22 | * **Vectorized**: scanning for values that change parser state is done using 23 | the Intel SSE 4.2 PCMPISTRI instruction. PCMPISTRI can locate the first occurrence 24 | of up to four values within a 16 byte vector, allowing searching 16 input 25 | bytes for end of line, escape, quote, or field separators in one instruction. 26 | 27 | * **Zero Copy**: the user supplies the parser's input buffer. The output is an 28 | array of column offsets within a row, each flagged to indicate whether an 29 | escape character was detected. The highest throughput is achieved in 30 | combination with memory-mapped files, where none of the OS, application or 31 | parser make any bulk copies. 32 | 33 | * **Lazy Decoding**: input is not copied or unquoted until requested. Since a 34 | flag indicates the presence of escapes, a fast path can avoid any bytewise 35 | decode in the usual case where no escape is present. Due to lazy decoding, 36 | csvmonkey is extremely effective at tasks that scan only a subset of data, 37 | for example one or two columns from a large many-columned CSV. 
This use case 38 | is the original motivation for the design. 39 | 40 | * **Header Only**: the parser has no third-party dependencies, just some 41 | templates defined in ``csvmonkey.hpp``. 42 | 43 | 44 | ## Python Usage 45 | 46 | You can install the library through pip: 47 | 48 | ``` 49 | pip install csvmonkey 50 | ``` 51 | 52 | If this fails on Ubuntu, first install `clang`: 53 | 54 | ``` 55 | sudo apt-get install clang 56 | ``` 57 | 58 | Then run: 59 | 60 | ``` 61 | CC=clang pip install csvmonkey 62 | ``` 63 | 64 | You can also install the library locally by cloning this repo and running: 65 | 66 | ``` 67 | pip install -e . 68 | ``` 69 | 70 | Then use it as follows: 71 | 72 | 1. `import csvmonkey` 73 | 2. Call `csvmonkey.from_path()` for a memory-mapped file, `csvmonkey.from_file()` 74 | for any file-like object with a `read()` method, or `csvmonkey.from_iter()` 75 | for any iterable object that yields lines or file chunks, e.g. 76 | `from_iter(open("ram.csv"))`. 77 | 78 | By default a magical `Row` object is yielded during iteration. This object is 79 | only a window into the currently parsed data, and becomes invalid upon the next 80 | iteration. Row data is accessed either by index or by key (if `header=True`) 81 | using: 82 | 83 | ``` 84 | for row in csvmonkey.from_path("ram.csv", header=True): 85 | row[20] # by index 86 | row["UnBlendedCost"] # by header value 87 | ``` 88 | 89 | If your CSV contains a header, specify `header=True` to read it during 90 | construction. If the CSV lacks a header but dict-like behaviour is desired, 91 | pass a header explicitly as `header=("a", "b", "c", "d")`. 92 | 93 | Element access causes the relevant chunk of the row to be copied to the heap 94 | and returned as a Python string. 95 | 96 | Rows may be converted to dicts via `row.asdict()`, tuples via 97 | `row.astuple()` or lists via `row.aslist()`. 
If you want rows to be produced 98 | directly in concrete form, pass `yields="list"`, `yields="tuple"`, or 99 | `yields="dict"` as a keyword argument. 100 | 101 | 102 | ### Unicode 103 | 104 | Unicode is supported for character sets where delimiters and line endings are 105 | represented by one byte. To configure Unicode, pass an ``encoding`` parameter, 106 | and optionally an ``errors`` parameter. 107 | 108 | * "bytes": Return bytes (default on Python 2) 109 | * "utf-8": Decode as UTF-8 (default on Python 3) 110 | * "ascii": Decode as ASCII 111 | * "latin1": Decode as LATIN1 112 | * "locale": Decode according to the active C locale 113 | * "...": Decode according to some codec "..." known to Python 114 | 115 | Where possible, prefer exact spelling and case matching one of the above 116 | encodings, to ensure an associated fast path is used. 117 | 118 | 119 | ## Python Benchmark 120 | 121 | ram.csv is 614MiB with 1,540,093 records of 22 columns and approximately 418 122 | bytes per record. An anonymized version is checked into LFS as 123 | ``tests/data/anon-ram.csv.zstd``. 
124 | 125 | Python 2.7 Sum: convert to float and sum single column: 126 | 127 | | Mode | Rate | Ratio | i7-6700HQ | Xeon E5530 | Core i5-2435M | 128 | |--------------------------|------------|-------|-----------|------------|---------------| 129 | | csvmonkey lazy decode | 1098 MiB/s | - | 0.559s | 0.9s | 1.29s | 130 | | csvmonkey yields="tuple" | 642 MiB/s | 1.7x | 0.956s | 1.87s | 2.17s | 131 | | csvmonkey yields="dict" | 281 MiB/s | 3.8x | 2.18s | 4.57s | 5.04s | 132 | | csv.reader | 223 MiB/s | 4.9x | 2.75s | 5.88s | 11.1s | 133 | | csv.DictReader | 85 MiB/s | 12.7x | 7.15s | 16.3s | 25.0s | 134 | 135 | Python 2.7 No-op: Iterate complete file, no other processing: 136 | 137 | | Mode | Rate | Ratio | i7-6700HQ | Xeon E5530 | 138 | |--------------------------|------------|-------|-----------|------------| 139 | | csvmonkey lazy decode | 1906 MiB/s | - | 0.322s | 0.444s | 140 | | csvmonkey yields="tuple" | 831 MiB/s | 2.3x | 0.738s | 1.4s | 141 | | csvmonkey yields="dict" | 318 MiB/s | 6.0x | 1.93s | 4.26s | 142 | | csv.reader | 248 MiB/s | 7.6x | 2.47s | 5.31s | 143 | | csv.DictReader | 92 MiB/s | 20.5x | 6.62s | 15.2s | 144 | 145 | Python 3.6 No-op: Iterate complete file, includes charset decoding 146 | 147 | | Mode | Rate | Ratio | i7-6700HQ | 148 | |---------------------------------------------|------------|------------|------------| 149 | | csvmonkey lazy decode | 1906 MiB/s | - | 0.322s | 150 | | csvmonkey yields="tuple", encoding="bytes" | 833 MiB/s | 2.3x | 0.737s | 151 | | csvmonkey yields="tuple", encoding="latin1" | 579 MiB/s | 3.3x | 1.06s | 152 | | csvmonkey yields="tuple" | 495 MiB/s | 3.8x | 1.24s | 153 | | csvmonkey yields="dict" | 235 MiB/s | 8.1x | 2.61s | 154 | | csv.reader | 121 MiB/s | 15.7x | 5.07s | 155 | | csv.DictReader | 55 MiB/s | 34.4x | 11.1s | 156 | 157 | 158 | ### Command lines 159 | 160 | Sum: 161 | 162 | ``` 163 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'sum(float(row["UnBlendedCost"]) for row in 
csvmonkey.from_path("ram.csv", header=True))' 164 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'sum(float(row[20]) for row in csvmonkey.from_path("ram.csv", header=True, yields="tuple"))' 165 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'sum(float(row["UnBlendedCost"]) for row in csvmonkey.from_path("ram.csv", header=True, yields="dict"))' 166 | python -mtimeit -n1 -r3 -s 'import csv' 'r = csv.reader(open("ram.csv")); next(r); sum(float(row[20]) for row in r)' 167 | python -mtimeit -n1 -r3 -s 'import csv' 'sum(float(row["UnBlendedCost"]) for row in csv.DictReader(open("ram.csv")))' 168 | ``` 169 | 170 | No-op: 171 | 172 | ``` 173 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'all(csvmonkey.from_path("ram.csv", header=True))' 174 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'all(csvmonkey.from_path("ram.csv", header=True, yields="tuple"))' 175 | python -mtimeit -n1 -r3 -s 'import csvmonkey' 'all(csvmonkey.from_path("ram.csv", header=True, yields="dict"))' 176 | python -mtimeit -n1 -r3 -s 'import csv' 'all(csv.reader(open("ram.csv")))' 177 | python -mtimeit -n1 -r3 -s 'import csv' 'all(csv.DictReader(open("ram.csv")))' 178 | ``` 179 | 180 | 181 | ## C++ Usage 182 | 183 | 1. Copy `csvmonkey.hpp` to your project and include it. 184 | 1. `CFLAGS=-msse4.2 -O3` 185 | 1. See `Makefile` for an example of producing a profile-guided build (worth an 186 | extra few %). 187 | 1. Instantiate `MappedFileCursor` (zero copy) or `FdStreamCursor` (buffered), attach it to a `CsvReader`. 188 | 1. Invoke `read_row()` and use `row().by_value()` to pick out `CsvCell` pointers for your desired columns. 189 | 1. Pump `read_row()` in a loop and use cell's `ptr()`, `size()`, `as_str()`, `equals()` and `as_double()` methods while `read_row()` returns true. 190 | 191 | 192 | # TODO 193 | 194 | * COW pointer interface to `as_str()`. 
195 | * ~~Finish Python 3 support~~ 196 | * ~~Ensure Python ReaderObject is always 16-byte aligned~~ 197 | * Fix handling of last row when it: 198 | * lacks newline, or 199 | * is truncated after final quote, or 200 | * is truncated within a quote, or 201 | * is truncated within an escape 202 | * Restartable: fix quadratic behaviour when `StreamCursor` yields lines and CSV 203 | rows span lines 204 | * ~~Python `from_file()` that uses `read()` in preference to `__iter__()`.~~ 205 | * ~~Fix CRLF / LFCR handling.~~ 206 | * ~~`StreamCursor` error / exception propagation.~~ 207 | * ~~Remove hard 256 column limit & fix crash if it's exceeded.~~ 208 | * ~~Ensure non-SSE fallback return codes match SSE when not found.~~ 209 | * ~~Map single zero page after file pages in MappedFileCursor~~ 210 | * ~~Add trailing 16 NUL bytes to BufferedStreamCursor~~ 211 | * ~~Remove hard-coded page size~~ 212 | * ~~(Single byte separator) Unicode support.~~ 213 | * (Multi byte separator) Unicode support. 214 | -------------------------------------------------------------------------------- /cpython/csvmonkey.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "csvmonkey.hpp" 5 | #include "iterator_stream_cursor.hpp" 6 | #include "file_stream_cursor.hpp" 7 | 8 | using namespace csvmonkey; 9 | 10 | extern PyTypeObject CellType; 11 | extern PyTypeObject ReaderType; 12 | extern PyTypeObject RowType; 13 | struct RowObject; 14 | 15 | 16 | enum CursorType 17 | { 18 | CURSOR_MAPPED_FILE, 19 | CURSOR_ITERATOR, 20 | CURSOR_PYTHON_FILE 21 | }; 22 | 23 | 24 | typedef PyObject *(*to_string_fn)(struct ReaderObject *, CsvCell *); 25 | 26 | struct ReaderObject 27 | { 28 | PyObject_HEAD 29 | CursorType cursor_type; 30 | StreamCursor *cursor; 31 | // CsvReader cannot be inline because pymalloc does not satisfy alignment 32 | // requirement 33 | CsvReader<> *reader; 34 | to_string_fn to_string; 35 | PyObject *(*yields)(RowObject *); 36 | 
int header; 37 | size_t record; // Current record number 38 | 39 | CsvCursor *row; 40 | PyObject *py_row; 41 | 42 | // Unicode. 43 | const char *encoding; // unused unless to_string==cell_to_unicode 44 | const char *errors; // "strict", "ignore", "..." 45 | 46 | // Map header string -> index. 47 | PyObject *header_map; 48 | }; 49 | 50 | 51 | struct CellObject 52 | { 53 | PyObject_HEAD; 54 | ReaderObject *reader; // strong ref. 55 | CsvCell *cell; 56 | }; 57 | 58 | 59 | struct RowObject 60 | { 61 | PyObject_HEAD; 62 | ReaderObject *reader; // strong ref, keeps row alive. 63 | CsvCursor *row; 64 | }; 65 | 66 | 67 | /* 68 | * String factories. 69 | */ 70 | 71 | #if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3 72 | # define HAS_LOCALE 73 | #endif 74 | 75 | 76 | static PyObject * 77 | cell_to_bytes(ReaderObject *reader, CsvCell *cell) 78 | { 79 | if(! cell->escaped) { 80 | return PyBytes_FromStringAndSize(cell->ptr, cell->size); 81 | } 82 | auto s = cell->as_str(); 83 | return PyBytes_FromStringAndSize(&s[0], s.size()); 84 | } 85 | 86 | 87 | static PyObject * 88 | cell_to_utf8(ReaderObject *reader, CsvCell *cell) 89 | { 90 | if(! cell->escaped) { 91 | return PyUnicode_DecodeUTF8(cell->ptr, cell->size, reader->errors); 92 | } 93 | auto s = cell->as_str(); 94 | return PyUnicode_DecodeUTF8(&s[0], s.size(), reader->errors); 95 | } 96 | 97 | 98 | static PyObject * 99 | cell_to_ascii(ReaderObject *reader, CsvCell *cell) 100 | { 101 | if(! cell->escaped) { 102 | return PyUnicode_DecodeASCII(cell->ptr, cell->size, reader->errors); 103 | } 104 | auto s = cell->as_str(); 105 | return PyUnicode_DecodeASCII(&s[0], s.size(), reader->errors); 106 | } 107 | 108 | 109 | static PyObject * 110 | cell_to_latin1(ReaderObject *reader, CsvCell *cell) 111 | { 112 | if(! 
cell->escaped) { 113 | return PyUnicode_DecodeLatin1(cell->ptr, cell->size, reader->errors); 114 | } 115 | auto s = cell->as_str(); 116 | return PyUnicode_DecodeLatin1(&s[0], s.size(), reader->errors); 117 | } 118 | 119 | 120 | #ifdef HAS_LOCALE 121 | static PyObject * 122 | cell_to_locale(ReaderObject *reader, CsvCell *cell) 123 | { 124 | if(! cell->escaped) { 125 | return PyUnicode_DecodeLocaleAndSize(cell->ptr, cell->size, reader->errors); 126 | } 127 | auto s = cell->as_str(); 128 | return PyUnicode_DecodeLocaleAndSize(&s[0], s.size(), reader->errors); 129 | } 130 | #endif 131 | 132 | 133 | static PyObject * 134 | cell_to_unicode(ReaderObject *reader, CsvCell *cell) 135 | { 136 | if(! cell->escaped) { 137 | return PyUnicode_Decode(cell->ptr, cell->size, 138 | reader->encoding, reader->errors); 139 | } 140 | auto s = cell->as_str(); 141 | return PyUnicode_Decode(&s[0], s.size(), reader->encoding, reader->errors); 142 | } 143 | 144 | 145 | /* 146 | * Cell methods. 147 | */ 148 | 149 | static PyObject * 150 | cell_as_double(CellObject *self) 151 | { 152 | return PyFloat_FromDouble(self->cell->as_double()); 153 | } 154 | 155 | 156 | static PyObject * 157 | cell_as_str(CellObject *self) 158 | { 159 | ReaderObject *r = self->reader; 160 | return r->to_string(r, self->cell); 161 | } 162 | 163 | static PyObject * 164 | cell_equals(CellObject *self, PyObject *args) 165 | { 166 | if(! PyTuple_GET_SIZE(args)) { 167 | return NULL; 168 | } 169 | 170 | PyObject *py_s = PyTuple_GET_ITEM(args, 0); 171 | if(! PyBytes_CheckExact(py_s)) { 172 | return NULL; 173 | } 174 | 175 | const char *s = PyBytes_AS_STRING(py_s); 176 | PyObject *py_true = ( 177 | self->cell->equals(s) ? 178 | Py_True : 179 | Py_False 180 | ); 181 | Py_INCREF(py_true); 182 | return py_true; 183 | } 184 | 185 | 186 | static int 187 | cell_compare(PyObject *self_, PyObject *o2) 188 | { 189 | CellObject *self = (CellObject *) self_; 190 | if(! 
PyBytes_CheckExact(o2)) { 191 | return -1; 192 | } 193 | 194 | const char *s = PyBytes_AS_STRING(o2); 195 | return self->cell->equals(s) ? 0 : -1; 196 | } 197 | 198 | 199 | static PyObject * 200 | cell_richcmp(PyObject *self_, PyObject *o2, int op) 201 | { 202 | CellObject *self = (CellObject *) self_; 203 | if(! PyBytes_CheckExact(o2)) { 204 | return NULL; 205 | } 206 | 207 | PyObject *out = Py_NotImplemented; 208 | if(op == Py_EQ) { 209 | const char *s = PyBytes_AS_STRING(o2); 210 | out = self->cell->equals(s) ? Py_True : Py_False; 211 | } 212 | 213 | Py_INCREF(out); 214 | return out; 215 | } 216 | 217 | 218 | static PyObject * 219 | cell_new(ReaderObject *reader, CsvCell *cell) 220 | { 221 | CellObject *py_cell = (CellObject *) PyObject_New(CellObject, &CellType); 222 | if(! py_cell) { 223 | return NULL; 224 | } 225 | 226 | Py_INCREF(reader); 227 | py_cell->reader = reader; 228 | py_cell->cell = cell; 229 | return (PyObject *) py_cell; 230 | } 231 | 232 | 233 | static void 234 | cell_dealloc(PyObject *self_) 235 | { 236 | CellObject *self = (CellObject *) self_; 237 | Py_DECREF(self->reader); 238 | self->reader = NULL; 239 | self->cell = NULL; 240 | } 241 | 242 | 243 | /* 244 | * Row methods 245 | */ 246 | 247 | static int 248 | row_clear(RowObject *self) 249 | { 250 | Py_CLEAR(self->reader); 251 | return 0; 252 | } 253 | 254 | 255 | static void 256 | row_dealloc(RowObject *self) 257 | { 258 | self->row = NULL; 259 | Py_CLEAR(self->reader); 260 | Py_TYPE(self)->tp_free((PyObject *)self); 261 | } 262 | 263 | 264 | static int 265 | row_traverse(RowObject *self, visitproc visit, void *arg) 266 | { 267 | Py_VISIT(self->reader); 268 | return 0; 269 | } 270 | 271 | 272 | static PyObject * 273 | row_new(ReaderObject *reader) 274 | { 275 | RowObject *self = (RowObject *) PyObject_GC_New(RowObject, &RowType); 276 | if(self) { 277 | Py_INCREF(reader); 278 | self->reader = reader; 279 | } 280 | self->row = reader->row; 281 | PyObject_GC_Track((PyObject *) self); 282 | 
return (PyObject *) self; 283 | } 284 | 285 | 286 | static PyObject * 287 | row_aslist(RowObject *self) 288 | { 289 | PyObject *lst = PyList_New(self->row->count); 290 | if(lst) { 291 | int count = self->row->count; 292 | CsvCell *cell = &self->row->cells[0]; 293 | ReaderObject *r = self->reader; 294 | to_string_fn to_string = r->to_string; 295 | 296 | for(int i = 0; i < count; i++, cell++) { 297 | PyObject *s = to_string(r, cell); 298 | if(! s) { 299 | Py_CLEAR(lst); 300 | break; 301 | } 302 | PyList_SET_ITEM(lst, i, s); 303 | } 304 | } 305 | 306 | return lst; 307 | } 308 | 309 | 310 | static PyObject * 311 | row_astuple(RowObject *self) 312 | { 313 | PyObject *tup = PyTuple_New(self->row->count); 314 | if(tup) { 315 | int count = self->row->count; 316 | CsvCell *cell = &self->row->cells[0]; 317 | ReaderObject *r = self->reader; 318 | to_string_fn to_string = r->to_string; 319 | 320 | for(int i = 0; i < count; i++, cell++) { 321 | PyObject *s = to_string(r, cell); 322 | if(! s) { 323 | Py_CLEAR(tup); 324 | break; 325 | } 326 | PyTuple_SET_ITEM(tup, i, s); 327 | } 328 | } 329 | 330 | return tup; 331 | } 332 | 333 | 334 | static PyObject * 335 | row_asdict(RowObject *self) 336 | { 337 | if(! self->reader->header_map) { 338 | PyErr_Format(PyExc_TypeError, 339 | "Cannot convert to dict; no header is present"); 340 | return NULL; 341 | } 342 | 343 | PyObject *out = PyDict_New(); 344 | if(out) { 345 | Py_ssize_t ppos = 0; 346 | PyObject *key; 347 | PyObject *value; 348 | 349 | CsvCell *cells = &self->row->cells[0]; 350 | ReaderObject *r = self->reader; 351 | to_string_fn to_string = r->to_string; 352 | 353 | while(PyDict_Next(self->reader->header_map, &ppos, &key, &value)) { 354 | int i = PyLong_AsLong(value); 355 | if(i < self->row->count) { 356 | PyObject *s = to_string(r, &cells[i]); 357 | if(! 
s) { 358 | Py_CLEAR(out); 359 | break; 360 | } 361 | 362 | if(PyDict_SetItem(out, key, s)) { 363 | Py_DECREF(s); 364 | Py_CLEAR(out); 365 | break; 366 | } 367 | 368 | Py_DECREF(s); 369 | } 370 | } 371 | } 372 | 373 | return out; 374 | } 375 | 376 | 377 | static PyObject * 378 | row_return_self(RowObject *self) 379 | { 380 | Py_INCREF(self); 381 | return (PyObject *) self; 382 | } 383 | 384 | 385 | static PyObject * 386 | row_repr(RowObject *self) 387 | { 388 | PyObject *obj; 389 | 390 | if(self->reader->header) { 391 | obj = row_asdict(self); 392 | } else { 393 | obj = row_astuple(self); 394 | } 395 | 396 | if(! obj) { 397 | return NULL; 398 | } 399 | 400 | PyObject *repr = PyUnicode_FromFormat( 401 | "", 402 | obj 403 | ); 404 | Py_DECREF(obj); 405 | return repr; 406 | } 407 | 408 | 409 | static Py_ssize_t 410 | row_length(RowObject *self) 411 | { 412 | return self->row->count; 413 | } 414 | 415 | 416 | static PyObject * 417 | row_getitem(RowObject *self, Py_ssize_t index) 418 | { 419 | if(index < 0) { 420 | index = self->row->count + index; 421 | } 422 | 423 | if(index < 0 || index > self->row->count) { 424 | PyErr_Format(PyExc_IndexError, 425 | "index %ld greater than parsed col count %lu", 426 | (unsigned long) index, 427 | (unsigned long) self->row->count); 428 | return NULL; 429 | } 430 | 431 | ReaderObject *r = self->reader; 432 | return r->to_string(r, &self->row->cells[index]); 433 | } 434 | 435 | 436 | static Py_ssize_t 437 | row_getlength(RowObject *self) 438 | { 439 | return (Py_ssize_t) self->row->count; 440 | } 441 | 442 | 443 | static PyObject * 444 | row_subscript(RowObject *self, PyObject *key) 445 | { 446 | int index; 447 | 448 | if(PyLong_Check(key)) { 449 | index = (int) PyLong_AsLong(key); 450 | if(index < 0) { 451 | index = self->row->count + index; 452 | } 453 | #if PY_MAJOR_VERSION < 3 454 | } else if(PyInt_Check(key)) { 455 | index = (int) PyInt_AS_LONG(key); 456 | if(index < 0) { 457 | index = self->row->count + index; 458 | } 459 | #endif 
460 | } else if(! self->reader->header_map) { 461 | PyErr_Format(PyExc_IndexError, "Reader instantiated with header=False"); 462 | return NULL; 463 | } else { 464 | PyObject *py_index = PyDict_GetItem(self->reader->header_map, key); 465 | if(! py_index) { 466 | PyErr_Format(PyExc_KeyError, "No such key."); 467 | return NULL; 468 | } 469 | index = (int) PyLong_AsLong(py_index); 470 | } 471 | 472 | if(index < 0 || index > self->row->count) { 473 | PyErr_Format(PyExc_IndexError, 474 | "index %ld greater than parsed col count %lu", 475 | (unsigned long) index, 476 | (unsigned long) self->row->count); 477 | return NULL; 478 | } 479 | 480 | ReaderObject *r = self->reader; 481 | return r->to_string(r, &self->row->cells[index]); 482 | } 483 | 484 | 485 | static PyObject * 486 | row_iter(RowObject *self) 487 | { 488 | PyObject *tup = row_astuple(self); 489 | if(! tup) { 490 | return NULL; 491 | } 492 | 493 | PyObject *iter = PyObject_GetIter(tup); 494 | Py_DECREF(tup); 495 | return iter; 496 | } 497 | 498 | 499 | /* 500 | * Reader methods 501 | */ 502 | 503 | static int 504 | reader_clear(ReaderObject *self) 505 | { 506 | Py_CLEAR(self->py_row); 507 | Py_CLEAR(self->header_map); 508 | return 0; 509 | } 510 | 511 | 512 | static void 513 | delete_cursor(CursorType type, StreamCursor *cursor) 514 | { 515 | switch(type) { 516 | case CURSOR_MAPPED_FILE: 517 | delete (MappedFileCursor *)cursor; 518 | break; 519 | case CURSOR_ITERATOR: 520 | delete (IteratorStreamCursor *)cursor; 521 | break; 522 | case CURSOR_PYTHON_FILE: 523 | delete (FileStreamCursor *)cursor; 524 | break; 525 | default: 526 | assert(0); 527 | } 528 | } 529 | 530 | 531 | static void 532 | reader_dealloc(ReaderObject *self) 533 | { 534 | reader_clear(self); 535 | delete self->reader; 536 | delete_cursor(self->cursor_type, self->cursor); 537 | Py_TYPE(self)->tp_free((PyObject *)self); 538 | } 539 | 540 | 541 | static int 542 | reader_traverse(ReaderObject *self, visitproc visit, void *arg) 543 | { 544 | 
Py_VISIT(self->py_row); 545 | return 0; 546 | } 547 | 548 | 549 | static int 550 | header_from_first_row(ReaderObject *self) 551 | { 552 | if(! self->reader->read_row()) { 553 | if(! PyErr_Occurred()) { 554 | PyErr_Format(PyExc_IOError, "Could not read header row"); 555 | } 556 | return -1; 557 | } 558 | 559 | self->header_map = PyDict_New(); 560 | if(! self->header_map) { 561 | return -1; 562 | } 563 | 564 | CsvCell *cell = &self->row->cells[0]; 565 | for(int i = 0; i < self->row->count; i++) { 566 | PyObject *key = self->to_string(self, cell); 567 | PyObject *value = PyLong_FromLong(i); 568 | assert(key && value); 569 | PyDict_SetItem(self->header_map, key, value); 570 | Py_DECREF(key); 571 | Py_DECREF(value); 572 | cell++; 573 | } 574 | 575 | return 0; 576 | } 577 | 578 | 579 | static int 580 | header_from_sequence(ReaderObject *self, PyObject *header) 581 | { 582 | self->header_map = PyDict_New(); 583 | if(! self->header_map) { 584 | return -1; 585 | } 586 | 587 | Py_ssize_t length = PySequence_Length(header); 588 | for(Py_ssize_t i = 0; i < length; i++) { 589 | PyObject *key = PySequence_GetItem(header, i); 590 | if(! key) { 591 | return -1; 592 | } 593 | 594 | PyObject *value = PyLong_FromLong(i); 595 | if(! value) { 596 | return -1; 597 | } 598 | 599 | PyDict_SetItem(self->header_map, key, value); 600 | Py_DECREF(key); 601 | Py_DECREF(value); 602 | } 603 | 604 | return 0; 605 | } 606 | 607 | 608 | static PyObject * 609 | reader_from_cursor(CursorType cursor_type, 610 | StreamCursor *cursor, 611 | const char *yields, 612 | PyObject *header, 613 | char delimiter, 614 | char quotechar, 615 | char escapechar, 616 | bool yield_incomplete_row, 617 | const char *encoding, 618 | const char *errors) 619 | { 620 | ReaderObject *self = PyObject_GC_New(ReaderObject, &ReaderType); 621 | if(! 
self) { 622 | delete_cursor(cursor_type, cursor); 623 | Py_DECREF(self); 624 | return NULL; 625 | } 626 | 627 | self->cursor_type = cursor_type; 628 | self->cursor = cursor; 629 | self->record = 0; 630 | self->errors = errors; 631 | 632 | if(! strcmp(yields, "dict")) { 633 | self->yields = row_asdict; 634 | } else if(! strcmp(yields, "list")) { 635 | self->yields = row_aslist; 636 | } else if(! strcmp(yields, "tuple")) { 637 | self->yields = row_astuple; 638 | } else { 639 | self->yields = row_return_self; 640 | } 641 | 642 | self->header = header && PyObject_IsTrue(header); 643 | self->reader = new CsvReader<>( 644 | *self->cursor, 645 | delimiter, 646 | quotechar, 647 | escapechar, 648 | yield_incomplete_row 649 | ); 650 | self->row = &self->reader->row(); 651 | self->py_row = row_new(self); 652 | 653 | // Default to UTF-8 encoding on Python 3. 654 | #if PY_MAJOR_VERSION >= 3 655 | if(! encoding) { 656 | encoding = "utf-8"; 657 | } 658 | #endif 659 | 660 | if((! encoding) || (! strcmp(encoding, "bytes"))) { 661 | self->to_string = cell_to_bytes; 662 | } else if(! strcmp(encoding, "utf-8")) { 663 | self->to_string = cell_to_utf8; 664 | } else if(! strcmp(encoding, "ascii")) { 665 | self->to_string = cell_to_ascii; 666 | } else if(! strcmp(encoding, "latin1")) { 667 | self->to_string = cell_to_latin1; 668 | #ifdef HAS_LOCALE 669 | } else if(! 
strcmp(encoding, "locale")) { 670 | self->to_string = cell_to_locale; 671 | #endif 672 | } else { 673 | self->encoding = encoding; 674 | self->to_string = cell_to_unicode; 675 | } 676 | 677 | if(self->header) { 678 | int rc; 679 | if(PySequence_Check(header)) { 680 | rc = header_from_sequence(self, header); 681 | } else { 682 | rc = header_from_first_row(self); 683 | } 684 | 685 | if(rc) { 686 | Py_DECREF((PyObject *) self); 687 | return NULL; 688 | } 689 | } else { 690 | self->header_map = NULL; 691 | } 692 | 693 | PyObject_GC_Track((PyObject *) self); 694 | return (PyObject *) self; 695 | } 696 | 697 | 698 | static PyObject * 699 | reader_from_path(PyObject *_self, PyObject *args, PyObject *kw) 700 | { 701 | static char *keywords[] = {"path", "yields", "header", "delimiter", 702 | "quotechar", "escapechar", "yield_incomplete_row", 703 | "encoding", "errors"}; 704 | const char *path; 705 | const char *yields = "row"; 706 | PyObject *header = NULL; 707 | char delimiter = ','; 708 | char quotechar = '"'; 709 | char escapechar = 0; 710 | int yield_incomplete_row = 0; 711 | const char *encoding = 0; 712 | const char *errors = 0; 713 | 714 | if(! 
PyArg_ParseTupleAndKeywords(args, kw, "s|sOccciss:from_path", keywords, 715 | &path, &yields, &header, &delimiter, "echar, &escapechar, 716 | &yield_incomplete_row, &encoding, &errors)) { 717 | return NULL; 718 | } 719 | 720 | MappedFileCursor *cursor = new MappedFileCursor(); 721 | try { 722 | cursor->open(path); 723 | } catch(csvmonkey::Error &e) { 724 | delete cursor; 725 | PyErr_Format(PyExc_IOError, "%s: %s", path, e.what()); 726 | return NULL; 727 | } 728 | 729 | return reader_from_cursor( 730 | CURSOR_MAPPED_FILE, 731 | cursor, 732 | yields, 733 | header, 734 | delimiter, 735 | quotechar, 736 | escapechar, 737 | yield_incomplete_row, 738 | encoding, 739 | errors 740 | ); 741 | } 742 | 743 | 744 | static PyObject * 745 | reader_from_iter(PyObject *_self, PyObject *args, PyObject *kw) 746 | { 747 | static char *keywords[] = {"iter", "yields", "header", 748 | "delimiter", "quotechar", "escapechar", "yield_incomplete_row", 749 | "encoding", "errors"}; 750 | PyObject *iterable; 751 | const char *yields = "row"; 752 | PyObject *header = NULL; 753 | char delimiter = ','; 754 | char quotechar = '"'; 755 | char escapechar = 0; 756 | int yield_incomplete_row = 0; 757 | const char *encoding = 0; 758 | const char *errors = 0; 759 | 760 | if(! PyArg_ParseTupleAndKeywords(args, kw, "O|sOccciss:from_iter", 761 | keywords, 762 | &iterable, &yields, &header, &delimiter, "echar, &escapechar, 763 | &yield_incomplete_row, &encoding, &errors)) { 764 | return NULL; 765 | } 766 | 767 | PyObject *iter = PyObject_GetIter(iterable); 768 | if(! 
iter) { 769 | return NULL; 770 | } 771 | 772 | return reader_from_cursor( 773 | CURSOR_ITERATOR, 774 | new IteratorStreamCursor(iter), 775 | yields, 776 | header, 777 | delimiter, 778 | quotechar, 779 | escapechar, 780 | yield_incomplete_row, 781 | encoding, 782 | errors 783 | ); 784 | } 785 | 786 | 787 | static PyObject * 788 | reader_from_file(PyObject *_self, PyObject *args, PyObject *kw) 789 | { 790 | static char *keywords[] = {"fp", "yields", "header", 791 | "delimiter", "quotechar", "escapechar", "yield_incomplete_row", 792 | "encoding", "errors"}; 793 | PyObject *fp; 794 | const char *yields = "row"; 795 | PyObject *header = NULL; 796 | char delimiter = ','; 797 | char quotechar = '"'; 798 | char escapechar = 0; 799 | int yield_incomplete_row = 0; 800 | const char *encoding = 0; 801 | const char *errors = 0; 802 | 803 | if(! PyArg_ParseTupleAndKeywords(args, kw, "O|sOccciss:from_file", keywords, 804 | &fp, &yields, &header, &delimiter, "echar, &escapechar, 805 | &yield_incomplete_row, &encoding, &errors)) { 806 | return NULL; 807 | } 808 | 809 | PyObject *py_read = PyObject_GetAttrString(fp, "read"); 810 | if(! py_read) { 811 | CSM_DEBUG("py_read is null"); 812 | return NULL; 813 | } 814 | 815 | return reader_from_cursor( 816 | CURSOR_PYTHON_FILE, 817 | new FileStreamCursor(py_read), 818 | yields, 819 | header, 820 | delimiter, 821 | quotechar, 822 | escapechar, 823 | yield_incomplete_row, 824 | encoding, 825 | errors 826 | ); 827 | } 828 | 829 | 830 | static PyObject * 831 | reader_get_header(ReaderObject *self, PyObject *args) 832 | { 833 | if(! 
self->header_map) { 834 | return PyList_New(0); 835 | } 836 | 837 | PyObject *lst = PyList_New(PyDict_Size(self->header_map)); 838 | Py_ssize_t ppos = 0; 839 | PyObject *key; 840 | PyObject *value; 841 | 842 | while(PyDict_Next(self->header_map, &ppos, &key, &value)) { 843 | int i = PyLong_AsLong(value); 844 | PyList_SET_ITEM(lst, i, key); 845 | Py_INCREF(key); 846 | } 847 | 848 | return lst; 849 | } 850 | 851 | 852 | static PyObject * 853 | reader_find_cell(ReaderObject *self, PyObject *args) 854 | { 855 | const char *s; 856 | if(! PyArg_ParseTuple(args, "s:find_cell", &s)) { 857 | return NULL; 858 | } 859 | 860 | CsvCell *cell; 861 | if(! self->row->by_value(s, cell)) { 862 | PyErr_Format(PyExc_KeyError, "%s", s); 863 | return NULL; 864 | } 865 | 866 | return cell_new(self, cell); 867 | } 868 | 869 | 870 | static PyObject * 871 | reader_repr(ReaderObject *self) 872 | { 873 | return PyUnicode_FromFormat( 874 | "", 875 | self->record 876 | ); 877 | } 878 | 879 | 880 | static PyObject * 881 | reader_iter(PyObject *self) 882 | { 883 | Py_INCREF(self); 884 | return self; 885 | } 886 | 887 | 888 | static PyObject * 889 | reader_iternext(ReaderObject *self) 890 | { 891 | if(self->reader->read_row()) { 892 | self->record++; 893 | return self->yields((RowObject *) self->py_row); 894 | } 895 | 896 | if(self->cursor->size() && !self->reader->in_newline_skip) { 897 | PyErr_Format(PyExc_IOError, 898 | "%lu unparsed bytes at end of input. The input may be missing a " 899 | "final newline, or unbalanced quotes are present.", 900 | (unsigned long) self->cursor->size() 901 | ); 902 | } 903 | if(! PyErr_Occurred()) { 904 | PyErr_SetNone(PyExc_StopIteration); 905 | } 906 | return NULL; 907 | } 908 | 909 | 910 | /* 911 | * Cell Type. 
912 | */ 913 | 914 | static PyMethodDef cell_methods[] = { 915 | {"as_double", (PyCFunction)cell_as_double, METH_NOARGS, ""}, 916 | {"as_str", (PyCFunction)cell_as_str, METH_NOARGS, ""}, 917 | {"equals", (PyCFunction)cell_equals, METH_VARARGS, ""}, 918 | {0, 0, 0, 0} 919 | }; 920 | 921 | PyTypeObject CellType = { 922 | PyVarObject_HEAD_INIT(NULL, 0) 923 | .tp_name = "_Cell", 924 | .tp_basicsize = sizeof(CellObject), 925 | .tp_dealloc = cell_dealloc, 926 | #if PY_MAJOR_VERSION < 3 927 | .tp_compare = cell_compare, 928 | #endif 929 | .tp_flags=Py_TPFLAGS_DEFAULT, 930 | .tp_doc="csvmonkey._Cell", 931 | .tp_richcompare=cell_richcmp, 932 | .tp_methods=cell_methods, 933 | }; 934 | 935 | 936 | /* 937 | * Row type. 938 | */ 939 | 940 | static PySequenceMethods row_sequence_methods = { 941 | (lenfunc) row_getlength, /* sq_length */ 942 | NULL, /* sq_concat */ 943 | NULL, /* sq_repeat */ 944 | (ssizeargfunc) row_getitem, /* sq_item */ 945 | }; 946 | 947 | 948 | static PyMappingMethods row_mapping_methods = { 949 | (lenfunc) row_getlength, /* mp_length */ 950 | (binaryfunc) row_subscript, /* mp_subscript */ 951 | 0, /* mp_ass_subscript */ 952 | }; 953 | 954 | static PyMethodDef row_methods[] = { 955 | {"aslist", (PyCFunction)row_aslist, METH_NOARGS, ""}, 956 | {"astuple", (PyCFunction)row_astuple, METH_NOARGS, ""}, 957 | {"asdict", (PyCFunction)row_asdict, METH_NOARGS, ""}, 958 | {0, 0, 0, 0} 959 | }; 960 | 961 | PyTypeObject RowType = { 962 | PyVarObject_HEAD_INIT(NULL, 0) 963 | "_Row", /*tp_name*/ 964 | sizeof(RowObject), /*tp_basicsize*/ 965 | 0, /*tp_itemsize*/ 966 | (destructor) row_dealloc, /*tp_dealloc*/ 967 | 0, /*tp_print*/ 968 | 0, /*tp_getattr*/ 969 | 0, /*tp_setattr*/ 970 | 0, /*tp_compare*/ 971 | (reprfunc)row_repr, /*tp_repr*/ 972 | 0, /*tp_as_number*/ 973 | &row_sequence_methods, /*tp_as_sequence*/ 974 | &row_mapping_methods, /*tp_as_mapping*/ 975 | 0, /*tp_hash*/ 976 | 0, /*tp_call*/ 977 | 0, /*tp_str*/ 978 | 0, /*tp_getattro*/ 979 | 0, /*tp_setattro*/ 980 | 
0, /*tp_as_buffer*/ 981 | Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_GC, /*tp_flags*/ 982 | "csvmonkey._Row", /*tp_doc*/ 983 | (traverseproc)row_traverse, /*tp_traverse*/ 984 | (inquiry)row_clear, /*tp_clear*/ 985 | 0, /*tp_richcompare*/ 986 | 0, /*tp_weaklistoffset*/ 987 | (getiterfunc) row_iter, /*tp_iter*/ 988 | 0, /*tp_iternext*/ 989 | row_methods, /*tp_methods*/ 990 | 0, /*tp_members*/ 991 | 0, /*tp_getset*/ 992 | 0, /*tp_base*/ 993 | 0, /*tp_dict*/ 994 | 0, /*tp_descr_get*/ 995 | 0, /*tp_descr_set*/ 996 | 0, /*tp_dictoffset*/ 997 | 0, /*tp_init*/ 998 | 0, /*tp_alloc*/ 999 | 0, /*tp_new*/ 1000 | 0, /*tp_free*/ 1001 | }; 1002 | 1003 | 1004 | /* 1005 | * Reader type. 1006 | */ 1007 | 1008 | static PyMethodDef reader_methods[] = { 1009 | {"get_header", (PyCFunction)reader_get_header, METH_NOARGS, ""}, 1010 | {"find_cell", (PyCFunction)reader_find_cell, METH_VARARGS, ""}, 1011 | {0, 0, 0, 0} 1012 | }; 1013 | 1014 | PyTypeObject ReaderType = { 1015 | PyVarObject_HEAD_INIT(NULL, 0) 1016 | "_Reader", /*tp_name*/ 1017 | sizeof(ReaderObject), /*tp_basicsize*/ 1018 | 0, /*tp_itemsize*/ 1019 | (destructor) reader_dealloc,/*tp_dealloc*/ 1020 | 0, /*tp_print*/ 1021 | 0, /*tp_getattr*/ 1022 | 0, /*tp_setattr*/ 1023 | 0, /*tp_compare*/ 1024 | (reprfunc)reader_repr, /*tp_repr*/ 1025 | 0, /*tp_as_number*/ 1026 | 0, /*tp_as_sequence*/ 1027 | 0, /*tp_as_mapping*/ 1028 | 0, /*tp_hash*/ 1029 | 0, /*tp_call*/ 1030 | 0, /*tp_str*/ 1031 | 0, /*tp_getattro*/ 1032 | 0, /*tp_setattro*/ 1033 | 0, /*tp_as_buffer*/ 1034 | Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_GC, /*tp_flags*/ 1035 | "csvmonkey._Reader", /*tp_doc*/ 1036 | (traverseproc)reader_traverse, /*tp_traverse*/ 1037 | (inquiry)reader_clear, /*tp_clear*/ 1038 | 0, /*tp_richcompare*/ 1039 | 0, /*tp_weaklistoffset*/ 1040 | (getiterfunc) reader_iter, /*tp_iter*/ 1041 | (iternextfunc) reader_iternext, /*tp_iternext*/ 1042 | reader_methods, /*tp_methods*/ 1043 | 0, /*tp_members*/ 1044 | 0, /*tp_getset*/ 1045 | 0, /*tp_base*/ 1046 | 0, /*tp_dict*/ 
1047 | 0, /*tp_descr_get*/ 1048 | 0, /*tp_descr_set*/ 1049 | 0, /*tp_dictoffset*/ 1050 | 0, /*tp_init*/ 1051 | 0, /*tp_alloc*/ 1052 | 0, /*tp_new*/ 1053 | 0, /*tp_free*/ 1054 | }; 1055 | 1056 | 1057 | /* 1058 | * Module constructor. 1059 | */ 1060 | 1061 | static struct PyMethodDef module_methods[] = { 1062 | {"from_path", (PyCFunction) reader_from_path, METH_VARARGS|METH_KEYWORDS}, 1063 | {"from_iter", (PyCFunction) reader_from_iter, METH_VARARGS|METH_KEYWORDS}, 1064 | {"from_file", (PyCFunction) reader_from_file, METH_VARARGS|METH_KEYWORDS}, 1065 | {0, 0, 0, 0} 1066 | }; 1067 | 1068 | #if PY_MAJOR_VERSION >= 3 1069 | static struct PyModuleDef moduledef = { 1070 | PyModuleDef_HEAD_INIT, 1071 | "csvmonkey", 1072 | NULL, 1073 | -1, 1074 | module_methods, 1075 | NULL, 1076 | NULL, 1077 | NULL, 1078 | NULL 1079 | }; 1080 | # define MOD_RETURN(mod) return mod; 1081 | # define MODINIT_NAME PyInit_csvmonkey 1082 | #else 1083 | # define MODINIT_NAME initcsvmonkey 1084 | # define MOD_RETURN(mod) return 1085 | #endif 1086 | 1087 | 1088 | PyMODINIT_FUNC 1089 | MODINIT_NAME(void) 1090 | { 1091 | static PyTypeObject *types[] = { 1092 | &CellType, &RowType, &ReaderType 1093 | }; 1094 | 1095 | #if PY_MAJOR_VERSION >= 3 1096 | PyObject *mod = PyModule_Create(&moduledef); 1097 | #else 1098 | PyObject *mod = Py_InitModule3("csvmonkey", module_methods, ""); 1099 | #endif 1100 | if(! 
mod) { 1101 | MOD_RETURN(NULL); 1102 | } 1103 | 1104 | for(int i = 0; i < (sizeof types / sizeof types[0]); i++) { 1105 | PyTypeObject *type = types[i]; 1106 | if(PyType_Ready(type)) { 1107 | MOD_RETURN(NULL); 1108 | } 1109 | if(PyModule_AddObject(mod, type->tp_name, (PyObject *)type)) { 1110 | MOD_RETURN(NULL); 1111 | } 1112 | } 1113 | 1114 | MOD_RETURN(mod); 1115 | } 1116 | -------------------------------------------------------------------------------- /cpython/file_stream_cursor.hpp: -------------------------------------------------------------------------------- 1 | 2 | class FileStreamCursor 3 | : public csvmonkey::BufferedStreamCursor 4 | { 5 | PyObject *args_tuple_; 6 | PyObject *read_; 7 | 8 | public: 9 | FileStreamCursor(PyObject *read) 10 | : BufferedStreamCursor() 11 | , read_(read) 12 | , args_tuple_(Py_BuildValue("(i)", 65536)) 13 | { 14 | assert(args_tuple_ != 0); 15 | } 16 | 17 | ~FileStreamCursor() 18 | { 19 | Py_DECREF(args_tuple_); 20 | Py_DECREF(read_); 21 | } 22 | 23 | virtual ssize_t readmore() 24 | { 25 | PyObject *result = PyObject_Call(read_, args_tuple_, NULL); 26 | CSM_DEBUG("result = %lu", result); 27 | if(! result) { 28 | return -1; 29 | } 30 | 31 | if(! PyBytes_CheckExact(result)) { 32 | PyErr_SetString(PyExc_TypeError, 33 | "CSV iterable must yield exactly a string."); 34 | Py_DECREF(result); 35 | return -1; 36 | } 37 | 38 | Py_ssize_t sz = PyBytes_GET_SIZE(result); 39 | if(! 
sz) { 40 | return -1; 41 | } 42 | 43 | ensure(sz); 44 | memcpy(&vec_[write_pos_], PyBytes_AS_STRING(result), sz); 45 | Py_DECREF(result); 46 | return sz; 47 | } 48 | }; 49 | -------------------------------------------------------------------------------- /cpython/iterator_stream_cursor.hpp: -------------------------------------------------------------------------------- 1 | 2 | class IteratorStreamCursor 3 | : public csvmonkey::BufferedStreamCursor 4 | { 5 | PyObject *iter_; 6 | 7 | public: 8 | IteratorStreamCursor(PyObject *iter) 9 | : BufferedStreamCursor() 10 | , iter_(iter) 11 | { 12 | } 13 | 14 | ~IteratorStreamCursor() 15 | { 16 | Py_DECREF(iter_); 17 | } 18 | 19 | virtual ssize_t readmore() 20 | { 21 | PyObject *result = PyIter_Next(iter_); 22 | if(! result) { 23 | return -1; 24 | } 25 | 26 | if(! PyBytes_CheckExact(result)) { 27 | PyErr_SetString(PyExc_TypeError, 28 | "CSV iterable must yield exactly a string."); 29 | Py_DECREF(result); 30 | return -1; 31 | } 32 | 33 | Py_ssize_t sz = PyBytes_GET_SIZE(result); 34 | if(! sz) { 35 | return -1; 36 | } 37 | 38 | ensure(sz); 39 | memcpy(&vec_[write_pos_], PyBytes_AS_STRING(result), sz); 40 | Py_DECREF(result); 41 | return sz; 42 | } 43 | }; 44 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | default: 5 | sphinx-build . build/html/ 6 | 7 | # You can set these variables from the command line. 8 | SPHINXOPTS = 9 | SPHINXBUILD = sphinx-build 10 | PAPER = 11 | BUILDDIR = build 12 | 13 | # User-friendly check for sphinx-build 14 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 15 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. 
Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from https://sphinx-doc.org/) 16 | endif 17 | 18 | # Internal variables. 19 | PAPEROPT_a4 = -D latex_paper_size=a4 20 | PAPEROPT_letter = -D latex_paper_size=letter 21 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 22 | # the i18n builder cannot share the environment and doctrees with the others 23 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 24 | 25 | .PHONY: help 26 | help: 27 | @echo "Please use \`make ' where is one of" 28 | @echo " html to make standalone HTML files" 29 | @echo " dirhtml to make HTML files named index.html in directories" 30 | @echo " changes to make an overview of all changed/added/deprecated items" 31 | @echo " linkcheck to check all external links for integrity" 32 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 33 | @echo " coverage to run coverage check of the documentation (if enabled)" 34 | 35 | .PHONY: clean 36 | clean: 37 | rm -rf $(BUILDDIR)/* 38 | 39 | .PHONY: html 40 | html: 41 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 42 | @echo 43 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 44 | 45 | .PHONY: dirhtml 46 | dirhtml: 47 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 48 | @echo 49 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 50 | 51 | .PHONY: changes 52 | changes: 53 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 54 | @echo 55 | @echo "The overview file is in $(BUILDDIR)/changes." 56 | 57 | .PHONY: linkcheck 58 | linkcheck: 59 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 60 | @echo 61 | @echo "Link check complete; look for any errors in the above output " \ 62 | "or in $(BUILDDIR)/linkcheck/output.txt." 
63 | 64 | .PHONY: doctest 65 | doctest: 66 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 67 | @echo "Testing of doctests in the sources finished, look at the " \ 68 | "results in $(BUILDDIR)/doctest/output.txt." 69 | 70 | .PHONY: coverage 71 | coverage: 72 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 73 | @echo "Testing of coverage in the sources finished, look at the " \ 74 | "results in $(BUILDDIR)/coverage/python.txt." 75 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | author = u'David Wilson' 2 | copyright = u'2019, David Wilson' 3 | exclude_patterns = ['build'] 4 | html_show_sourcelink = False 5 | html_show_sphinx = False 6 | html_sidebars = {'**': ['globaltoc.html', 'github.html']} 7 | html_static_path = ['static'] 8 | html_theme_path = ['.'] 9 | html_theme = 'alabaster' 10 | html_theme_options = { 11 | 'font_family': "Georgia, serif", 12 | 'head_font_family': "Georgia, serif", 13 | 'fixed_sidebar': True, 14 | 'show_powered_by': False, 15 | 'pink_2': 'fffafaf', 16 | 'pink_1': '#fff0f0', 17 | } 18 | htmlhelp_basename = 'csvmonkeydoc' 19 | language = None 20 | master_doc = 'index' 21 | project = u'csvmonkey' 22 | pygments_style = 'sphinx' 23 | source_suffix = '.rst' 24 | templates_path = ['templates'] 25 | todo_include_todos = False 26 | version = '0.0.2' 27 | release = version 28 | -------------------------------------------------------------------------------- /docs/cpp.rst: -------------------------------------------------------------------------------- 1 | 2 | C++ API 3 | ======= 4 | 5 | .. default-domain:: cpp 6 | 7 | 8 | 9 | Errors 10 | ------ 11 | 12 | .. class:: csvmonkey::Error : public std::exception 13 | 14 | Thrown in various places during setup, due to failed file IO or field 15 | extraction. 16 | 17 | .. 
function:: const char \*what() const 18 | 19 | Describe the reason for the error. 20 | 21 | 22 | Cursors 23 | ------- 24 | 25 | .. class:: csvmonkey::StreamCursor 26 | 27 | Abstract base for implementing iteration over an input stream. 28 | :class:`StreamCursor` is the interface used by :class:`CsvReader` to 29 | acquire and manage its input buffer. 30 | 31 | .. function:: virtual const char \*buf() = 0 32 | 33 | Current stream position. Must guarantee access to 34 | `buf()[0..size()+31]`, with 31 trailing NULs to allow safely running 35 | ``PCMPSTRI`` on the final data byte. 36 | 37 | .. function:: virtual size_t size() = 0 38 | 39 | Size of the buffer pointed to by :func:`buf`. 40 | 41 | .. function:: virtual void consume(size_t n) = 0 42 | 43 | Called by :class:`CsvReader` to indicate `n` bytes from the front of 44 | :func:`buf` have been consumed. The return value of :func:`buf` and 45 | :func:`size` should now reflect a buffer positioned on the first byte 46 | following `n`. 47 | 48 | .. function:: virtual bool fill() = 0 49 | 50 | Called by :class:`CsvReader` to request more input. This function 51 | returns `true` to indicate the buffer provided by :func:`buf` and 52 | :func:`size` has been extended, or `false` to indicate EOF. 53 | 54 | 55 | .. class:: csvmonkey::MappedFileCursor : public StreamCursor 56 | 57 | Implement zero-copy input using a memory-mapped file. 58 | 59 | .. function:: void open(const char \*filename) 60 | 61 | Open `filename` for reading. Throws :class:`Error` on failure. 62 | 63 | 64 | .. class:: csvmonkey::BufferedStreamCursor : public StreamCursor 65 | 66 | Base class for any cursor implementation that requires buffering. 67 | 68 | .. member:: std::vector vec_ 69 | 70 | The buffer 71 | 72 | .. member:: size_t write_pos_ 73 | 74 | Current write offset within the buffer. New data appended to 75 | :member:`vec_` by :func:`readmore` should append past `write_pos_`. 76 | 77 | .. 
function:: void ensure(size_t capacity) 78 | 79 | Ensure at least `capacity` additional bytes are available in the buffer 80 | starting at the current write position. 81 | 82 | .. function:: virtual ssize_t readmore() = 0 83 | 84 | Arrange for more data to fill the buffer. Your implementation should 85 | issue some IO request and copy the result into `vec_[write_pos_:]`. 86 | The function should return -1 on error, 0 on EOF, or a positive count to 87 | indicate how many bytes were appended. 88 | 89 | 90 | .. class:: csvmonkey::FdStreamCursor : public BufferedStreamCursor 91 | 92 | Implement buffered input from a UNIX file descriptor. 93 | 94 | .. function:: FdStreamCursor(int fd) 95 | 96 | Construct a new instance using `fd`. 97 | 98 | 99 | CsvCell 100 | ------- 101 | 102 | .. class:: csvmonkey::CsvCell 103 | 104 | Descriptor for a single parsed CSV field. 105 | 106 | Cells describe fields in terms of references to :func:`StreamCursor::buf`, 107 | and thus become invalid once the underlying stream cursor is mutated. 108 | :class:`CsvReader` reuses a single vector of cells throughout the run, 109 | so every cell returned by a successful :func:`CsvReader::parse_row` 110 | call is invalidated by the next call to :func:`CsvReader::parse_row`. 111 | 112 | .. member:: const char \*ptr 113 | 114 | Pointer to the start of the CSV field. 115 | 116 | .. member:: size_t size 117 | 118 | Size of the CSV field. 119 | 120 | .. member:: char escapechar 121 | 122 | Escape character configured for the :class:`CsvReader`. 123 | 124 | .. member:: char quotechar 125 | 126 | Quote character configured for the :class:`CsvReader`. 127 | 128 | .. member:: bool escaped 129 | 130 | If `true`, at least one escape character exists in the field. Its 131 | value must be accessed via :func:`CsvCell::as_str`. 132 | 133 | .. function:: std::string as_str() 134 | 135 | Return a string with any quotes and escapes decoded.
136 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Table of Contents 3 | ================== 4 | 5 | 6 | .. toctree:: 7 | 8 | python 9 | cpp 10 | -------------------------------------------------------------------------------- /docs/python.rst: -------------------------------------------------------------------------------- 1 | 2 | Python API 3 | ========== 4 | 5 | 6 | 7 | Factory Functions 8 | ----------------- 9 | 10 | 11 | Common Arguments 12 | ^^^^^^^^^^^^^^^^ 13 | 14 | * **yields**: Specify the kind of value returned during iteration. 15 | 16 | * `row`: Cause a :class:`Row` to be yielded. Rows are dict/sequence-like 17 | objects that support lazy decoding. 18 | * `list`: Cause a fully decoded list to be yielded. 19 | * `tuple`: Cause a fully decoded tuple to be yielded. 20 | * `dict`: Cause a fully decoded dict to be yielded, in the style of 21 | :class:`csv.DictReader`. 22 | 23 | * **header**: Specify whether a header row exists, or specifies an explicit set 24 | of column names. The header row is used to form keys available via the 25 | :class:`Row` object, or for constructing dicts. May be any of: 26 | 27 | * :data:`True`: a header row exists and should be read away during 28 | construction. 29 | * :data:`False`: no header row exists. 30 | 31 | 32 | :param: header 33 | Foo bar baz. 34 | :param: delimiter 35 | Foo bar baz. 36 | :param: quotechar 37 | Foo bar baz. 38 | :param: escapechar 39 | Foo bar baz. 40 | :param bool: yield_incomplete_row 41 | Foo bar baz. 42 | :param: encoding 43 | Name of the encoding 44 | :param str: errors 45 | One of "strict", "ignore" or "replace". 46 | 47 | 48 | 49 | 50 | .. function:: from_iter 51 | 52 | 53 | 54 | .. function:: from_file 55 | .. 
function:: from_path 56 | 57 | -------------------------------------------------------------------------------- /docs/static/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dw/csvmonkey/dc621253348e3cb353c3641dfbf2193c276f6dfe/docs/static/.empty -------------------------------------------------------------------------------- /docs/templates/github.html: -------------------------------------------------------------------------------- 1 |

2 |
3 | Star 4 |

5 | -------------------------------------------------------------------------------- /docs/templates/globaltoc.html: -------------------------------------------------------------------------------- 1 | {{ toctree() }} 2 | -------------------------------------------------------------------------------- /docs/templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% set css_files = css_files + ['_static/style.css'] %} 3 | 4 | {# We don't support Sphinx search, so don't let its JS either. #} 5 | {% block scripts %} 6 | {% endblock %} 7 | 8 | {# Alabaster ships a completely useless custom.css, suppress it. #} 9 | {%- block extrahead %} 10 | 11 | 12 | {% endblock %} 13 | 14 | {% block footer %} 15 | {{ super() }} 16 | 17 | 26 | 27 | 33 | 34 | 35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /docs/templates/piwik-config.js: -------------------------------------------------------------------------------- 1 | window._paq = []; 2 | window._paq.push(['trackPageView']); 3 | window._paq.push(['enableLinkTracking']); 4 | window._paq.push(['enableHeartBeatTimer', 30]); 5 | window._paq.push(['setSiteId', 6]); 6 | -------------------------------------------------------------------------------- /include/csvmonkey.hpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #if defined(__SSE4_2__) && !defined(CSM_IGNORE_SSE42) 19 | #define CSM_USE_SSE42 20 | #include 21 | #include 22 | #endif // __SSE4_2__ 23 | 24 | #ifdef USE_SPIRIT 25 | #include "boost/spirit/include/qi.hpp" 26 | #endif 27 | 28 | 29 | #ifdef CSVMONKEY_DEBUG 30 | # define CSM_DEBUG(x, ...) 
fprintf(stderr, "csvmonkey: " x "\n", ##__VA_ARGS__); 31 | #else 32 | # define CSM_DEBUG(x...) {} 33 | #endif 34 | 35 | 36 | namespace csvmonkey { 37 | 38 | 39 | class StreamCursor; 40 | 41 | 42 | template 43 | class CsvReader; 44 | 45 | 46 | class Error : public std::exception 47 | { 48 | std::string s_; 49 | 50 | public: 51 | Error(const char *category, const std::string &s) 52 | : s_(category) 53 | { 54 | s_.append(": "); 55 | s_.append(s); 56 | } 57 | 58 | virtual const char * 59 | what() const throw() 60 | { 61 | return s_.c_str(); 62 | } 63 | }; 64 | 65 | 66 | class StreamCursor 67 | { 68 | public: 69 | /** 70 | * Current stream position. Must guarantee access to buf()[0..size()+15], 71 | * with 31 trailing NULs to allow safely running PCMPSTRI on the final data 72 | * byte. 73 | */ 74 | virtual const char *buf() = 0; 75 | virtual size_t size() = 0; 76 | virtual void consume(size_t n) = 0; 77 | virtual bool fill() = 0; 78 | }; 79 | 80 | 81 | class MappedFileCursor 82 | : public StreamCursor 83 | { 84 | char *startp_; 85 | char *endp_; 86 | char *p_; 87 | char *guardp_; 88 | 89 | size_t get_page_size() 90 | { 91 | return (size_t) sysconf(_SC_PAGESIZE); 92 | } 93 | 94 | public: 95 | MappedFileCursor() 96 | : startp_(0) 97 | , endp_(0) 98 | , p_(0) 99 | , guardp_(0) 100 | { 101 | } 102 | 103 | ~MappedFileCursor() 104 | { 105 | if(startp_) { 106 | ::munmap(startp_, endp_ - startp_); 107 | } 108 | if(guardp_) { 109 | ::munmap(guardp_, get_page_size()); 110 | } 111 | } 112 | 113 | const char *buf() 114 | { 115 | return p_; 116 | } 117 | 118 | size_t size() 119 | { 120 | return endp_ - p_; 121 | } 122 | 123 | void consume(size_t n) 124 | { 125 | p_ += std::min(n, (size_t) (endp_ - p_)); 126 | CSM_DEBUG("consume(%lu); new size: %lu", n, size()) 127 | } 128 | 129 | bool fill() 130 | { 131 | return false; 132 | } 133 | 134 | void open(const char *filename) 135 | { 136 | int fd = ::open(filename, O_RDONLY); 137 | if(fd == -1) { 138 | throw Error(filename, 
strerror(errno)); 139 | } 140 | 141 | struct stat st; 142 | if(fstat(fd, &st) == -1) { 143 | ::close(fd); 144 | throw Error("fstat", strerror(errno)); 145 | } 146 | 147 | // UNIX sucks. We can't use MAP_FIXED to ensure a guard page appears 148 | // after the file data because it'll silently overwrite mappings for 149 | // unrelated stuff in RAM (causing bizarro unrelated errors and 150 | // segfaults). We can't rely on the kernel's map placement behaviour 151 | // because it varies depending on the size of the mapping (guard page 152 | // ends up sandwiched between .so mappings, data file ends up at bottom 153 | // of range with no space left before first .so). We can't parse 154 | // /proc/self/maps because that sucks and is nonportable and racy. We 155 | // could use random addresses pumped into posix_mem_offset() but that 156 | // is insane and likely slow and non-portable and racy. 157 | // 158 | // So that leaves us with: make a MAP_ANON mapping the size of the 159 | // datafile + the guard page, leaving the kernel to pick addresses, 160 | // then use MAP_FIXED to overwrite it. We can't avoid the MAP_FIXED 161 | // since there would otherwise be a race between the time we 162 | // mmap/munmap to find a usable address range, and another thread 163 | // performing the same operation. So here we exploit crap UNIX 164 | // semantics to avoid a race. 165 | 166 | unsigned long page_size = get_page_size(); 167 | unsigned long page_mask = page_size - 1; 168 | size_t rounded = (st.st_size & page_mask) 169 | ? ((st.st_size & ~page_mask) + page_size) 170 | : st.st_size; 171 | 172 | auto startp = (char *) mmap(0, rounded+page_size, PROT_READ, 173 | MAP_ANON|MAP_PRIVATE, 0, 0); 174 | if(! 
startp) { 175 | ::close(fd); 176 | throw Error("mmap", "could not allocate guard page"); 177 | } 178 | 179 | guardp_ = startp + rounded; 180 | startp_ = (char *) mmap(startp, st.st_size, PROT_READ, 181 | MAP_SHARED|MAP_FIXED, fd, 0); 182 | ::close(fd); 183 | 184 | if(startp_ != startp) { 185 | CSM_DEBUG("could not place data below guard page (%p) at %p, got %p.", 186 | guardp_, startp, startp_); 187 | throw Error("mmap", "could not place data below guard page"); 188 | } 189 | 190 | ::madvise(startp_, st.st_size, MADV_SEQUENTIAL); 191 | ::madvise(startp_, st.st_size, MADV_WILLNEED); 192 | endp_ = startp_ + st.st_size; 193 | p_ = startp_; 194 | } 195 | }; 196 | 197 | 198 | class BufferedStreamCursor 199 | : public StreamCursor 200 | { 201 | protected: 202 | std::vector vec_; 203 | size_t read_pos_; 204 | size_t write_pos_; 205 | 206 | virtual ssize_t readmore() = 0; 207 | 208 | BufferedStreamCursor() 209 | : vec_(131072) 210 | , read_pos_(0) 211 | , write_pos_(0) 212 | { 213 | } 214 | 215 | protected: 216 | void ensure(size_t capacity) 217 | { 218 | size_t available = vec_.size() - write_pos_; 219 | if(available < capacity) { 220 | CSM_DEBUG("resizing vec_ %lu", (size_t)(vec_.size() + capacity)); 221 | vec_.resize(32 + (vec_.size() + capacity)); 222 | } 223 | } 224 | 225 | public: 226 | const char *buf() 227 | { 228 | return &vec_[0] + read_pos_; 229 | } 230 | 231 | size_t size() 232 | { 233 | return write_pos_ - read_pos_; 234 | } 235 | 236 | void consume(size_t n) 237 | { 238 | read_pos_ += std::min(n, write_pos_ - read_pos_); 239 | CSM_DEBUG("consume(%lu); new size: %lu", n, size()) 240 | } 241 | 242 | virtual bool fill() 243 | { 244 | if(read_pos_) { 245 | size_t n = write_pos_ - read_pos_; 246 | CSM_DEBUG("read_pos_ needs adjust, it is %lu / n = %lu", read_pos_, n); 247 | memcpy(&vec_[0], &vec_[read_pos_], n); 248 | CSM_DEBUG("fill() adjust old write_pos = %lu", write_pos_); 249 | write_pos_ -= read_pos_; 250 | read_pos_ = 0; 251 | CSM_DEBUG("fill() adjust new 
write_pos = %lu", write_pos_); 252 | } 253 | 254 | if(write_pos_ == vec_.size()) { 255 | ensure(vec_.size() / 2); 256 | } 257 | 258 | ssize_t rc = readmore(); 259 | if(rc == -1) { 260 | CSM_DEBUG("readmore() failed"); 261 | return false; 262 | } 263 | 264 | CSM_DEBUG("readmore() succeeded") 265 | CSM_DEBUG("fill() old write_pos = %lu", write_pos_); 266 | write_pos_ += rc; 267 | CSM_DEBUG("fill() new write_pos = %lu", write_pos_); 268 | return write_pos_ > 0; 269 | } 270 | }; 271 | 272 | 273 | class FdStreamCursor 274 | : public BufferedStreamCursor 275 | { 276 | int fd_; 277 | 278 | public: 279 | FdStreamCursor(int fd) 280 | : BufferedStreamCursor() 281 | , fd_(fd) 282 | { 283 | } 284 | 285 | virtual ssize_t readmore() 286 | { 287 | return ::read(fd_, &vec_[write_pos_], vec_.size() - write_pos_); 288 | } 289 | }; 290 | 291 | 292 | struct CsvCell 293 | { 294 | const char *ptr; 295 | size_t size; 296 | 297 | char escapechar; 298 | char quotechar; 299 | bool escaped; 300 | 301 | std::string as_str() 302 | { 303 | auto s = std::string(ptr, size); 304 | if(escaped) { 305 | int o = 0; 306 | for(size_t i = 0; i < s.size();) { 307 | char c = s[i]; 308 | if((escapechar && c == escapechar) || (c == quotechar)) { 309 | i++; 310 | } 311 | s[o++] = s[i++]; 312 | } 313 | s.resize(o); 314 | } 315 | return s; 316 | } 317 | 318 | bool startswith(const char *str) const 319 | { 320 | return std::string(ptr, std::min(size, strlen(str))) == str; 321 | } 322 | 323 | bool equals(const char *str) const 324 | { 325 | auto p = ptr; 326 | for(auto len = size; len--;) { 327 | if((! 
*p) || *str++ != *p++) { 328 | return false; 329 | } 330 | } 331 | return true; 332 | } 333 | 334 | double as_double() 335 | { 336 | #ifdef USE_SPIRIT 337 | namespace qi = boost::spirit::qi; 338 | using qi::double_; 339 | double n; 340 | qi::parse(ptr, ptr+size, double_, n); 341 | return n; 342 | #else 343 | return strtod(ptr, NULL); 344 | #endif 345 | } 346 | }; 347 | 348 | 349 | struct FieldPair 350 | { 351 | const char *name; 352 | CsvCell **cell; 353 | }; 354 | 355 | 356 | #ifndef CSM_USE_SSE42 357 | #warning Using non-SSE4.2 fallback implementation. 358 | /** 359 | * Callable that matches a set of up to 5 bytes (including NUL) in a 16 byte 360 | * string. The index 0..15 of the first occurrence is returned, otherwise 16 is 361 | * returned if no match is found or NUL is encountered. 362 | */ 363 | struct StringSpannerFallback 364 | { 365 | uint8_t charset_[256]; 366 | 367 | StringSpannerFallback(char c1=0, char c2=0, char c3=0, char c4=0) 368 | { 369 | ::memset(charset_, 0, sizeof charset_); 370 | charset_[(unsigned) c1] = 1; 371 | charset_[(unsigned) c2] = 1; 372 | charset_[(unsigned) c3] = 1; 373 | charset_[(unsigned) c4] = 1; 374 | charset_[0] = 1; 375 | } 376 | 377 | size_t 378 | operator()(const char *s) 379 | __attribute__((__always_inline__)) 380 | { 381 | CSM_DEBUG("bitfield[32] = %d", charset_[32]); 382 | CSM_DEBUG("span[0] = {%d,%d,%d,%d,%d,%d,%d,%d}", 383 | s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]); 384 | CSM_DEBUG("span[1] = {%d,%d,%d,%d,%d,%d,%d,%d}", 385 | s[8], s[9], s[10], s[11], s[12], s[13], s[14], s[15]); 386 | 387 | auto p = (const unsigned char *)s; 388 | auto e = p + 16; 389 | 390 | do { 391 | if(charset_[p[0]]) { 392 | break; 393 | } 394 | if(charset_[p[1]]) { 395 | p++; 396 | break; 397 | } 398 | if(charset_[p[2]]) { 399 | p += 2; 400 | break; 401 | } 402 | if(charset_[p[3]]) { 403 | p += 3; 404 | break; 405 | } 406 | p += 4; 407 | } while(p < e); 408 | 409 | if(! 
*p) { 410 | return 16; // PCMPISTRI reports NUL encountered as no match. 411 | } 412 | 413 | return p - (const unsigned char *)s; 414 | } 415 | }; 416 | 417 | using StringSpanner = StringSpannerFallback; 418 | # define CSM_ATTR_SSE42 419 | #endif // !CSM_USE_SSE42 420 | 421 | 422 | #ifdef CSM_USE_SSE42 423 | struct alignas(16) StringSpannerSse42 424 | { 425 | __m128i v_; 426 | 427 | StringSpannerSse42(char c1=0, char c2=0, char c3=0, char c4=0) 428 | { 429 | assert(! ((reinterpret_cast(&v_) & 15))); 430 | __v16qi vq = {c1, c2, c3, c4}; 431 | v_ = (__m128i) vq; 432 | } 433 | 434 | size_t __attribute__((__always_inline__, target("sse4.2"))) 435 | operator()(const char *buf) 436 | { 437 | return _mm_cmpistri( 438 | v_, 439 | _mm_loadu_si128((__m128i *) buf), 440 | 0 441 | ); 442 | } 443 | }; 444 | 445 | using StringSpanner = StringSpannerSse42; 446 | # define CSM_ATTR_SSE42 __attribute__((target("sse4.2"))) 447 | #endif // CSM_USE_SSE42 448 | 449 | 450 | class CsvCursor 451 | { 452 | public: 453 | std::vector cells; 454 | size_t count; 455 | 456 | CsvCursor() 457 | : cells() 458 | , count(0) 459 | { 460 | } 461 | 462 | bool 463 | by_value(const std::string &value, CsvCell *&cell) 464 | { 465 | for(size_t i = 0; i < count; i++) { 466 | if(value == cells[i].as_str()) { 467 | cell = &cells[i]; 468 | return true; 469 | } 470 | } 471 | return false; 472 | } 473 | }; 474 | 475 | 476 | template 477 | class alignas(16) CsvReader 478 | { 479 | const char *endp_; 480 | const char *p_; 481 | char delimiter_; 482 | char quotechar_; 483 | char escapechar_; 484 | bool yield_incomplete_row_; 485 | 486 | public: 487 | bool in_newline_skip; 488 | 489 | private: 490 | StreamCursorType &stream_; 491 | StringSpanner quoted_cell_spanner_; 492 | StringSpanner unquoted_cell_spanner_; 493 | CsvCursor row_; 494 | 495 | enum CsmTryParseReturnType { 496 | kCsmTryParseOkay, 497 | kCsmTryParseOverflow, 498 | kCsmTryParseUnderrun 499 | }; 500 | 501 | CsmTryParseReturnType 502 | try_parse() 503 | 
CSM_ATTR_SSE42 504 | { 505 | const char *p = p_; 506 | const char *cell_start; 507 | int rc, rc2; 508 | 509 | CsvCell *cell = &row_.cells[0]; 510 | row_.count = 0; 511 | 512 | #define PREAMBLE() \ 513 | if(p >= endp_) {\ 514 | CSM_DEBUG("pos exceeds size"); \ 515 | return kCsmTryParseUnderrun; \ 516 | } \ 517 | CSM_DEBUG("p = %#p; remain = %ld; next char is: %d", p, endp_-p, (int)*p) \ 518 | CSM_DEBUG("%d: distance to next newline: %d", __LINE__, strchr(p, '\n') - p); 519 | 520 | #define NEXT_CELL() \ 521 | ++cell; \ 522 | if(row_.count == row_.cells.size()) { \ 523 | CSM_DEBUG("cell array overflow"); \ 524 | return kCsmTryParseOverflow; \ 525 | } 526 | 527 | CSM_DEBUG("remain = %lu", endp_ - p); 528 | CSM_DEBUG("ch = %d %c", (int) *p, *p); 529 | 530 | newline_skip: 531 | /* 532 | * Skip newlines appearing at the start of the line, which may be a 533 | * result of DOS/MAC-formatted input. Or a double-spaced CSV file. 534 | */ 535 | in_newline_skip = true; 536 | PREAMBLE() 537 | if(*p == '\r' || *p == '\n') { 538 | ++p; 539 | goto newline_skip; 540 | } 541 | 542 | cell_start: 543 | in_newline_skip = false; 544 | PREAMBLE() 545 | cell->escaped = false; 546 | if(*p == '\r' || *p == '\n') { 547 | /* 548 | * A newline appearing after at least one cell has been read 549 | * indicates the presence of a single comma demarcating an unquoted 550 | * empty final field. 
551 | */ 552 | cell->ptr = 0; 553 | cell->size = 0; 554 | ++row_.count; 555 | p_ = p + 1; 556 | return kCsmTryParseOkay; 557 | } else if(*p == quotechar_) { 558 | cell_start = ++p; 559 | goto in_quoted_cell; 560 | } else { 561 | cell_start = p; 562 | goto in_unquoted_cell; 563 | } 564 | 565 | in_quoted_cell: 566 | PREAMBLE() 567 | rc = quoted_cell_spanner_(p); 568 | rc2 = quoted_cell_spanner_(p+16); 569 | if(rc != 16) { 570 | p += rc + 1; 571 | goto in_escape_or_end_of_quoted_cell; 572 | } 573 | 574 | switch(rc2) { 575 | case 16: 576 | p += 32; 577 | goto in_quoted_cell; 578 | default: 579 | p += rc2 + 1 + 16; 580 | goto in_escape_or_end_of_quoted_cell; 581 | } 582 | 583 | in_escape_or_end_of_quoted_cell: 584 | PREAMBLE() 585 | if(*p == delimiter_) { 586 | cell->ptr = cell_start; 587 | cell->size = p - cell_start - 1; 588 | ++row_.count; 589 | NEXT_CELL(); 590 | ++p; 591 | goto cell_start; 592 | } else if(*p == '\r' || *p == '\n') { 593 | cell->ptr = cell_start; 594 | cell->size = p - cell_start - 1; 595 | ++row_.count; 596 | p_ = p + 1; 597 | return kCsmTryParseOkay; 598 | } else { 599 | cell->escaped = true; 600 | ++p; 601 | goto in_quoted_cell; 602 | } 603 | 604 | in_unquoted_cell: 605 | CSM_DEBUG("\n\nin_unquoted_cell") 606 | PREAMBLE() 607 | rc = unquoted_cell_spanner_(p); 608 | CSM_DEBUG("unquoted span: %d; p[3]=%d p[..17]='%.17s'", rc, p[3], p); 609 | rc2 = unquoted_cell_spanner_(p+16); 610 | if(rc != 16) { 611 | p += rc; 612 | goto in_escape_or_end_of_unquoted_cell; 613 | } 614 | 615 | switch(rc2) { 616 | case 16: 617 | p += 32; 618 | goto in_unquoted_cell; 619 | default: 620 | p += rc2 + 16; 621 | goto in_escape_or_end_of_unquoted_cell; 622 | } 623 | 624 | in_escape_or_end_of_unquoted_cell: 625 | PREAMBLE() 626 | if(*p == delimiter_) { 627 | cell->ptr = cell_start; 628 | cell->size = p - cell_start; 629 | ++row_.count; 630 | CSM_DEBUG("in_escape_or_end_of_unquoted_cell(DELIMITER)") 631 | CSM_DEBUG("p[..17] = '%.17s'", p) 632 | CSM_DEBUG("done cell: 
'%.*s'", (int)cell->size, cell->ptr) 633 | NEXT_CELL(); 634 | ++p; 635 | goto cell_start; 636 | } else if(*p == '\r' || *p == '\n') { 637 | CSM_DEBUG("in_escape_or_end_of_unquoted_cell(NEWLINE)") 638 | cell->ptr = cell_start; 639 | cell->size = p - cell_start; 640 | ++row_.count; 641 | p_ = p + 1; 642 | return kCsmTryParseOkay; 643 | } else { 644 | cell->escaped = true; 645 | ++p; 646 | goto in_unquoted_cell; 647 | } 648 | 649 | CSM_DEBUG("error out"); 650 | return kCsmTryParseUnderrun; 651 | } 652 | 653 | #undef PREAMBLE 654 | #undef NEXT_CELL 655 | 656 | public: 657 | 658 | /** 659 | * Extract CsvCell pointers to fields with a particular value. Used as a 660 | * convenience for parsing the header row into a list of desired columns. 661 | * Throws csvmonkey::Error if a desired column is not found in the row. 662 | * 663 | * @example 664 | * CsvCell *resource_id; 665 | * CsvCell *item_description; 666 | * 667 | * if(! reader.read_row()) { 668 | * throw Error("cannot parse header row"); 669 | * } 670 | * 671 | * reader.extract_fields({ 672 | * {"ResourceId", &resource_id}, 673 | * {"ItemDescription", &item_description}, 674 | * }); 675 | */ 676 | void 677 | extract_fields(const std::vector &pairs) 678 | { 679 | for(const auto &pair : pairs) { 680 | if(! row_.by_value(pair.name, *pair.cell)) { 681 | std::string e("Could not find required header: "); 682 | e.append(pair.name); 683 | throw Error("extract_fields", e); 684 | } 685 | } 686 | } 687 | 688 | void 689 | _resize() 690 | { 691 | auto &cells = row_.cells; 692 | auto size = cells.size() * 2; 693 | if(! 
size) { 694 | size = 32; 695 | } 696 | 697 | cells.resize(size); 698 | // For as_str() 699 | for(size_t i = 0; i < size; i++) { 700 | CsvCell &cell = cells[i]; 701 | cell.quotechar = quotechar_; 702 | cell.escapechar = escapechar_; 703 | } 704 | } 705 | 706 | bool 707 | read_row() 708 | { 709 | const char *p; 710 | CSM_DEBUG("") 711 | 712 | do { 713 | p = stream_.buf(); 714 | p_ = p; 715 | endp_ = p + stream_.size(); 716 | switch(try_parse()) { 717 | case kCsmTryParseOkay: 718 | stream_.consume(p_ - p); 719 | return true; 720 | case kCsmTryParseOverflow: 721 | _resize(); 722 | return read_row(); 723 | case kCsmTryParseUnderrun: 724 | ; 725 | } 726 | CSM_DEBUG("attempting fill!") 727 | } while(stream_.fill()); 728 | 729 | if(row_.count && yield_incomplete_row_) { 730 | CSM_DEBUG("stream fill failed, but partial row exists") 731 | stream_.consume(endp_ - p); 732 | return true; 733 | } 734 | 735 | CSM_DEBUG("stream fill failed") 736 | return false; 737 | } 738 | 739 | CsvCursor & 740 | row() 741 | { 742 | return row_; 743 | } 744 | 745 | CsvReader(StreamCursorType &stream, 746 | char delimiter=',', 747 | char quotechar='"', 748 | char escapechar=0, 749 | bool yield_incomplete_row=false) 750 | : endp_(stream.buf() + stream.size()) 751 | , p_(stream.buf()) 752 | , delimiter_(delimiter) 753 | , quotechar_(quotechar) 754 | , escapechar_(escapechar) 755 | , yield_incomplete_row_(yield_incomplete_row) 756 | , stream_(stream) 757 | , quoted_cell_spanner_(quotechar, escapechar) 758 | , unquoted_cell_spanner_(delimiter, '\r', '\n', escapechar) 759 | { 760 | _resize(); 761 | } 762 | }; 763 | 764 | 765 | } // namespace csvmonkey 766 | -------------------------------------------------------------------------------- /scripts/calc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os.path 4 | import sys 5 | 6 | print (os.path.getsize('ram.csv') / 1048576.0) / (float(sys.argv[1]) / 1e6), 'GiB/s' 7 | 
-------------------------------------------------------------------------------- /scripts/compare.py: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | from itertools import izip 4 | except ImportError: 5 | izip = zip 6 | 7 | import csv 8 | import csvmonkey 9 | 10 | 11 | icsv = csv.reader(open('ram.csv')) 12 | next(icsv) 13 | 14 | imonkey = csvmonkey.from_path('ram.csv', yields='list', header=True) 15 | 16 | for r1, r2 in izip(icsv, imonkey): 17 | print(r1) 18 | print(r2) 19 | assert r1 == r2 20 | -------------------------------------------------------------------------------- /scripts/csvcut.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import csv 5 | import operator 6 | import sys 7 | from itertools import chain 8 | 9 | import csvmonkey 10 | 11 | 12 | parser = argparse.ArgumentParser(description='Process some integers.') 13 | parser.add_argument('paths', nargs='*') 14 | parser.add_argument('-H', '--no-header', action='store_true', default=False) 15 | parser.add_argument('-f', '--fields', default='-') 16 | 17 | args = parser.parse_args() 18 | 19 | if args.fields == '-': 20 | slices = [slice(None)] 21 | else: 22 | slices = [] 23 | for bit in args.fields.split(','): 24 | left, sep, right = bit.partition('-') 25 | if sep: 26 | slices.append(slice(int(left) - 1, int(right))) 27 | else: 28 | slices.append(slice(int(left) -1, int(left))) 29 | 30 | ig = operator.itemgetter(*slices) 31 | 32 | 33 | writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL) 34 | 35 | readers = [ 36 | csvmonkey.from_path(path, header=not args.no_header, yields='tuple') 37 | for path in args.paths 38 | ] 39 | if not readers: 40 | it = iter(sys.stdin.readline, '') 41 | readers.append(csvmonkey.from_iter(it, header=not args.no_header, yields='tuple')) 42 | 43 | 44 | for reader in readers: 45 | for row in reader: 46 | l = [] 47 | for sl in slices: 48 | 
l.extend(row[sl]) 49 | writer.writerow(l) 50 | -------------------------------------------------------------------------------- /scripts/dequote.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | 4 | writer = csv.writer(file('ram.noquotes-64mb.csv', 'w'), quoting=csv.QUOTE_NONE) 5 | for row in csv.reader(file('ram.64mb.csv')): 6 | writer.writerow([c.replace(',', '') for c in row]) 7 | -------------------------------------------------------------------------------- /scripts/makesum.py: -------------------------------------------------------------------------------- 1 | 2 | import hashlib 3 | import csv 4 | 5 | import csvmonkey 6 | 7 | reader = csv.reader(open('ram.csv')) 8 | h = hashlib.sha256() 9 | 10 | for row in reader: 11 | for col in row: 12 | h.update(col) 13 | 14 | assert h.hexdigest() == ( 15 | "68187f51a11392551209d440710d835cdc167e2150eccb34e8cf9192bb8f9fc6" 16 | ) 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | 6 | from setuptools import Extension 7 | from setuptools import setup 8 | 9 | sys.path.insert(0, 10 | os.path.join( 11 | os.path.dirname(__file__), 12 | 'third_party', 13 | ) 14 | ) 15 | 16 | import cpuid 17 | 18 | 19 | def has_sse42(): 20 | cpu = cpuid.CPUID() 21 | regs = cpu(1) 22 | return bool((1 << 20) & regs[2]) 23 | 24 | 25 | extra_compile_args = [] 26 | extra_compile_args += ['-std=c++11'] 27 | extra_compile_args += ['-Iinclude'] 28 | extra_compile_args += ['-O3'] 29 | extra_compile_args += ['-w'] 30 | 31 | # cc1plus: warning: command line option '-Wstrict-prototypes' is valid for 32 | # C/ObjC but not for C++ 33 | extra_compile_args += ['-Wno-strict-prototypes'] 34 | 35 | #extra_compile_args += ['-DUSE_SPIRIT'] 36 | #extra_compile_args += ['-I/home/dmw/src/boost_1_64_0'] 37 | 
#extra_compile_args += ['-fprofile-generate', '-lgcov'] 38 | #extra_compile_args += ['-DCSVMONKEY_DEBUG'] 39 | 40 | 41 | if has_sse42(): 42 | extra_compile_args += ['-msse4.2'] 43 | else: 44 | print("Warning: CPU lacks SSE4.2, compiling with fallback", 45 | file=sys.stderr) 46 | 47 | 48 | setup( 49 | name='csvmonkey', 50 | author='David Wilson', 51 | author_email='dw+csvmonkey@botanicus.net', 52 | version='0.0.5', 53 | classifiers=[], 54 | url='https://github.com/dw/csvmonkey/', 55 | ext_modules = [ 56 | Extension( 57 | name='csvmonkey', 58 | sources=['cpython/csvmonkey.cpp'], 59 | undef_macros=['NDEBUG'], 60 | extra_compile_args=extra_compile_args, 61 | ) 62 | ], 63 | zip_safe = False, 64 | ) 65 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | CMakeCache.txt 2 | CMakeFiles 3 | Makefile 4 | cmake_install.cmake 5 | main 6 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.11) 2 | 3 | #SET_SOURCE_FILES_PROPERTIES( nosse_stringspanner_test.cpp PROPERTIES COMPILE_FLAGS -Wderp ) 4 | 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wunused -msse4.2") 6 | 7 | 8 | add_executable(main 9 | main.cpp 10 | sse42_stringspanner_test.cpp 11 | fallback_stringspanner_test.cpp 12 | ) 13 | 14 | set_property(TARGET main PROPERTY CXX_STANDARD 11) 15 | -------------------------------------------------------------------------------- /tests/_stringspanner_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "catch.hpp" 4 | #include "csvmonkey.hpp" 5 | 6 | 7 | TEST_CASE(PREFIX "initialNullTerminates", "[stringspanner]") 8 | { 9 | // PCMPISTRI returns 16 to indicate null encountered. 
10 | const char *x = "\x00this,should,never,be,reached"; 11 | csvmonkey::StringSpanner ss(','); 12 | REQUIRE(ss(x) == 16); 13 | } 14 | 15 | 16 | TEST_CASE(PREFIX "midNullTerminates", "[stringspanner]") 17 | { 18 | // PCMPISTRI returns 16 to indicate null encountered. 19 | const char *x = "derp\x00this,should,never,be,reached"; 20 | csvmonkey::StringSpanner ss(','); 21 | REQUIRE(ss(x) == 16); 22 | } 23 | 24 | 25 | TEST_CASE(PREFIX "noMatchTerminates0", "[stringspanner]") 26 | { 27 | // Comma not found. 28 | const char *x = "derpderpderpderpderp"; 29 | csvmonkey::StringSpanner ss(','); 30 | REQUIRE(ss(x) == 16); 31 | } 32 | 33 | 34 | TEST_CASE(PREFIX "noMatchTerminates1", "[stringspanner]") 35 | { 36 | // No terminator specified. 37 | const char *x = "derpderpderpderpderp"; 38 | csvmonkey::StringSpanner ss; 39 | REQUIRE(ss(x) == 16); 40 | } 41 | 42 | 43 | TEST_CASE(PREFIX "matchAtEachOffset", "[stringspanner]") 44 | { 45 | const char *x = "derpderpderpderpderp"; 46 | for(int i = 0; i < 16; i++) { 47 | std::string s(x); 48 | s[i] = ','; 49 | INFO("i = " << i); 50 | csvmonkey::StringSpanner ss(','); 51 | REQUIRE(ss(s.c_str()) == i); 52 | } 53 | } 54 | 55 | 56 | TEST_CASE(PREFIX "matchPos16", "[stringspanner]") 57 | { 58 | const char *x = "derpderpderpderpderp"; 59 | std::string s(x); 60 | s[16] = ','; 61 | csvmonkey::StringSpanner ss(','); 62 | REQUIRE(ss(s.c_str()) == 16); 63 | } 64 | 65 | 66 | TEST_CASE(PREFIX "matchPos17", "[stringspanner]") 67 | { 68 | const char *x = "derpderpderpderpderp"; 69 | std::string s(x); 70 | s[17] = ','; 71 | csvmonkey::StringSpanner ss(','); 72 | REQUIRE(ss(s.c_str()) == 16); 73 | } 74 | -------------------------------------------------------------------------------- /tests/bench/iteration.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "csvmonkey.hpp" 7 | 8 | using csvmonkey::CsvCell; 9 | using csvmonkey::CsvCursor; 10 | using 
csvmonkey::CsvReader; 11 | using csvmonkey::MappedFileCursor; 12 | using std::chrono::duration_cast; 13 | using std::chrono::high_resolution_clock; 14 | using std::chrono::microseconds; 15 | 16 | 17 | static void 18 | die(const char *msg) 19 | { 20 | fprintf(stderr, "%s\n", msg); 21 | exit(1); 22 | } 23 | 24 | 25 | static int 26 | go(const char *path) 27 | { 28 | MappedFileCursor stream; 29 | CsvReader reader(stream); 30 | 31 | stream.open(path); 32 | CsvCursor &row = reader.row(); 33 | if(! reader.read_row()) { 34 | die("Cannot read header row"); 35 | } 36 | 37 | CsvCell *cost_cell; 38 | if((! row.by_value("Cost", cost_cell)) && 39 | (! row.by_value("UnBlendedCost", cost_cell))) { 40 | die("Cannot find Cost column"); 41 | } 42 | 43 | CsvCell *resource_id_cell; 44 | if(! row.by_value("ResourceId", resource_id_cell)) { 45 | die("Cannot find ResourceId column"); 46 | } 47 | 48 | CsvCell *record_type_cell; 49 | if(! row.by_value("RecordType", record_type_cell)) { 50 | die("Cannot find RecordType column"); 51 | } 52 | 53 | auto now = [&] { return high_resolution_clock::now(); }; 54 | double total = 0.0; 55 | auto start = now(); 56 | 57 | while(reader.read_row()) { 58 | if(0) { 59 | if(record_type_cell->equals("LineItem")) { 60 | total += cost_cell->as_double(); 61 | } else if(record_type_cell->equals("Rounding")) { 62 | total += cost_cell->as_double(); 63 | } 64 | } 65 | } 66 | auto finish = now(); 67 | 68 | printf("Total cost: %lf\n", total); 69 | auto usec = duration_cast(finish - start).count(); 70 | 71 | struct stat st; 72 | stat(path, &st); 73 | 74 | std::cout << usec << " us\n"; 75 | std::cout << (st.st_size / usec) << " bytes/us\n"; 76 | std::cout << ( 77 | (1e6 / (1024.0 * 1048576.0)) * (double) (st.st_size / usec) 78 | ) << " GiB/s\n"; 79 | return 0; 80 | } 81 | 82 | 83 | int main(int argc, char **argv) 84 | { 85 | const char *path = "ram.csv"; 86 | if(argc > 1) { 87 | path = argv[1]; 88 | } 89 | for(int i = 0 ; i < 5; i++) { 90 | go(path); 91 | } 92 | } 93 | 
-------------------------------------------------------------------------------- /tests/csvmonkey_test.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | 4 | import csvmonkey 5 | 6 | 7 | EXAMPLE_FILE = """ 8 | c0,c1,c2,c3 9 | 0,1,2,3 10 | a,b,c,d 11 | """ 12 | 13 | 14 | def make_reader(s, **kwargs): 15 | return csvmonkey.from_iter(iter([s]), **kwargs) 16 | 17 | 18 | class ParseTest(unittest.TestCase): 19 | def test_bad_split(self): 20 | s = "2017-05-01T02:15:08.000Z 2 229340663981 eni-00589050 172.31.11.238 138.246.253.19 443 54503 6 1 44 1493604908 1493604966 ACCEPT OK\n" 21 | reader = make_reader(s, delimiter=' ', header=False) 22 | self.assertEquals(next(reader).astuple(), tuple(s.split())) 23 | 24 | 25 | 26 | class RowTest(unittest.TestCase): 27 | def reader(self): 28 | return make_reader(EXAMPLE_FILE) 29 | 30 | def test_getitem_numeric_positive(self): 31 | reader = self.reader() 32 | row = next(reader) 33 | self.assertEquals("0", row[0]) 34 | self.assertRaises(IndexError, lambda: row[5]) 35 | 36 | def test_getitem_numeric_negative(self): 37 | reader = self.reader() 38 | row = next(reader) 39 | self.assertEquals("3", row[-1]) 40 | self.assertRaises(IndexError, lambda: row[-5]) 41 | 42 | def test_getitem_key(self): 43 | reader = self.reader() 44 | row = next(reader) 45 | self.assertEquals("0", row["c0"]) 46 | self.assertRaises(KeyError, lambda: row["missing"]) 47 | 48 | 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /tests/data/anon-ram.csv.zstd: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8f94ae436de5dc0a8033f7fdd2c7348e538546a6799a20ea6a62705d220f2687 3 | size 2117190 4 | -------------------------------------------------------------------------------- /tests/fallback_stringspanner_test.cpp: 
-------------------------------------------------------------------------------- 1 | #define CSM_IGNORE_SSE42 2 | #define PREFIX "fallback_stringspanner_" 3 | #include "_stringspanner_test.cpp" 4 | -------------------------------------------------------------------------------- /tests/fullsum.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "csvmonkey.hpp" 4 | #include "picosha2.h" 5 | 6 | 7 | using namespace csvmonkey; 8 | 9 | 10 | int main(int argc, char **argv) 11 | { 12 | const char *path = "ram.csv"; 13 | if(argc > 1) { 14 | path = argv[1]; 15 | } 16 | 17 | MappedFileCursor stream; 18 | stream.open(path); 19 | CsvReader reader(stream); 20 | CsvCursor &row = reader.row(); 21 | 22 | picosha2::hash256_one_by_one hasher; 23 | while(reader.read_row()) { 24 | for(size_t i = 0; i < row.count; i++) { 25 | CsvCell &cell = row.cells[i]; 26 | std::string s = cell.as_str(); 27 | hasher.process(s.begin(), s.end()); 28 | } 29 | } 30 | 31 | hasher.finish(); 32 | std::cout << picosha2::get_hash_hex_string(hasher) << "\n"; 33 | return 0; 34 | } 35 | -------------------------------------------------------------------------------- /tests/main.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include "catch.hpp" 3 | -------------------------------------------------------------------------------- /tests/parser_test.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import io 4 | 5 | import csvmonkey 6 | 7 | 8 | def parse(s): 9 | return list( 10 | csvmonkey.from_file( 11 | io.BytesIO(s), 12 | header=False, 13 | yields='tuple' 14 | ) 15 | ) 16 | 17 | 18 | class BoundaryTest(unittest.TestCase): 19 | def test_4094(self): 20 | # parsing ends on page boundary 21 | c = 'x' * 4094 22 | s = '%s,\n' % (c,) 23 | self.assertEquals([(c,'')], parse(s)) 24 | 25 | def test_4095(self): 26 | # parsing 
ends on first byte of new page 27 | c = 'x' * 4095 28 | s = '%s,\n' % (c,) 29 | self.assertEquals([(c,'')], parse(s)) 30 | 31 | def test_14(self): 32 | # parsing ends on 16th byte (SSE4.2) 33 | c = 'x' * 14 34 | s = '%s,\n' % (c,) 35 | self.assertEquals([(c,'')], parse(s)) 36 | 37 | def test_15(self): 38 | # parsing ends on 17th byte (SSE4.2) 39 | c = 'x' * 15 40 | s = '%s,\n' % (c,) 41 | self.assertEquals([(c,'')], parse(s)) 42 | 43 | 44 | class Test(unittest.TestCase): 45 | def test_empty0(self): 46 | self.assertEquals([], parse('')) 47 | 48 | def test_empty1(self): 49 | self.assertEquals([], parse('\n')) 50 | 51 | def test_empty2(self): 52 | self.assertEquals([], parse('\r\n')) 53 | 54 | def test_empty2(self): 55 | self.assertEquals([], parse('\r\n\n\r\r\r\n')) 56 | 57 | def test_unquoted_noeol(self): 58 | self.assertEquals([('a', 'b')], parse('a,b')) 59 | 60 | def test_unquoted_noeol2(self): 61 | self.skipTest('failing') 62 | self.assertEquals([('a', 'b'), ('c', 'd')], parse('a,b\n\rc,d')) 63 | 64 | def test_unquoted(self): 65 | self.assertEquals([('a', 'b')], parse('a,b\n')) 66 | 67 | def test_quoted_empty(self): 68 | self.assertEquals([('',)], parse('""\n')) 69 | 70 | def test_quoted_empty_unquoted(self): 71 | self.skipTest('failing') 72 | self.assertEquals([('', '')], parse('"",\n')) 73 | 74 | def test_unquoted_empty(self): 75 | self.skipTest('failing') 76 | self.assertEquals([('', '')], parse(',\n')) 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | -------------------------------------------------------------------------------- /tests/sse42_stringspanner_test.cpp: -------------------------------------------------------------------------------- 1 | #define PREFIX "sse42_stringspanner_" 2 | #include "_stringspanner_test.cpp" 3 | -------------------------------------------------------------------------------- /third_party/cpuid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 
| # Copyright (c) 2014 Anders Høst 4 | # 5 | 6 | from __future__ import print_function 7 | 8 | import platform 9 | import os 10 | import sys 11 | import ctypes 12 | from ctypes import c_uint32, c_int, c_size_t, c_void_p, POINTER, CFUNCTYPE 13 | 14 | # Posix x86_64: 15 | # Two first call registers : RDI, RSI 16 | # Volatile registers : RAX, RCX, RDX, RSI, RDI, R8-11 17 | 18 | # Windows x86_64: 19 | # Two first call registers : RCX, RDX 20 | # Volatile registers : RAX, RCX, RDX, R8-11 21 | 22 | # cdecl 32 bit: 23 | # Two first call registers : Stack (%esp) 24 | # Volatile registers : EAX, ECX, EDX 25 | 26 | _POSIX_64_OPC = [ 27 | 0x53, # push %rbx 28 | 0x48, 0x89, 0xf0, # mov %rsi,%rax 29 | 0x31, 0xc9, # xor %ecx,%ecx 30 | 0x0f, 0xa2, # cpuid 31 | 0x89, 0x07, # mov %eax,(%rdi) 32 | 0x89, 0x5f, 0x04, # mov %ebx,0x4(%rdi) 33 | 0x89, 0x4f, 0x08, # mov %ecx,0x8(%rdi) 34 | 0x89, 0x57, 0x0c, # mov %edx,0xc(%rdi) 35 | 0x5b, # pop %rbx 36 | 0xc3 # retq 37 | ] 38 | 39 | _WINDOWS_64_OPC = [ 40 | 0x53, # push %rbx 41 | 0x48, 0x89, 0xd0, # mov %rdx,%rax 42 | 0x49, 0x89, 0xc8, # mov %rcx, %r8 43 | 0x31, 0xc9, # xor %ecx,%ecx 44 | 0x0f, 0xa2, # cpuid 45 | 0x41, 0x89, 0x00, # mov %eax,(%r8) 46 | 0x41, 0x89, 0x58, 0x04, # mov %ebx,0x4(%r8) 47 | 0x41, 0x89, 0x48, 0x08, # mov %ecx,0x8(%r8) 48 | 0x41, 0x89, 0x50, 0x0c, # mov %edx,0xc(%r8) 49 | 0x5b, # pop %rbx 50 | 0xc3 # retq 51 | ] 52 | 53 | _CDECL_32_OPC = [ 54 | 0x53, # push %ebx 55 | 0x57, # push %edi 56 | 0x8b, 0x7c, 0x24, 0x0c, # mov 0xc(%esp),%edi 57 | 0x8b, 0x44, 0x24, 0x10, # mov 0x10(%esp),%eax 58 | 0x31, 0xc9, # xor %ecx,%ecx 59 | 0x0f, 0xa2, # cpuid 60 | 0x89, 0x07, # mov %eax,(%edi) 61 | 0x89, 0x5f, 0x04, # mov %ebx,0x4(%edi) 62 | 0x89, 0x4f, 0x08, # mov %ecx,0x8(%edi) 63 | 0x89, 0x57, 0x0c, # mov %edx,0xc(%edi) 64 | 0x5f, # pop %edi 65 | 0x5b, # pop %ebx 66 | 0xc3 # ret 67 | ] 68 | 69 | is_windows = os.name == "nt" or sys.platform == "cygwin" 70 | is_64bit = ctypes.sizeof(ctypes.c_voidp) == 8 71 | 72 | class 
CPUID_struct(ctypes.Structure): 73 | _fields_ = [(r, c_uint32) for r in ("eax", "ebx", "ecx", "edx")] 74 | 75 | class CPUID(object): 76 | def __init__(self): 77 | if platform.machine() not in ("AMD64", "x86_64", "x86", "i686"): 78 | raise SystemError("Only available for x86") 79 | 80 | if is_windows: 81 | if is_64bit: 82 | # VirtualAlloc seems to fail under some weird 83 | # circumstances when ctypes.windll.kernel32 is 84 | # used under 64 bit Python. CDLL fixes this. 85 | self.win = ctypes.CDLL("kernel32.dll") 86 | opc = _WINDOWS_64_OPC 87 | else: 88 | # Here ctypes.windll.kernel32 is needed to get the 89 | # right DLL. Otherwise it will fail when running 90 | # 32 bit Python on 64 bit Windows. 91 | self.win = ctypes.windll.kernel32 92 | opc = _CDECL_32_OPC 93 | else: 94 | opc = _POSIX_64_OPC if is_64bit else _CDECL_32_OPC 95 | 96 | size = len(opc) 97 | code = (ctypes.c_ubyte * size)(*opc) 98 | 99 | self.r = CPUID_struct() 100 | 101 | if is_windows: 102 | self.addr = self.win.VirtualAlloc(None, size, 0x1000, 0x40) 103 | if not self.addr: 104 | raise MemoryError("Could not allocate RWX memory") 105 | else: 106 | self.libc = ctypes.cdll.LoadLibrary(None) 107 | self.libc.valloc.restype = ctypes.c_void_p 108 | self.libc.valloc.argtypes = [ctypes.c_size_t] 109 | self.addr = self.libc.valloc(size) 110 | if not self.addr: 111 | raise MemoryError("Could not allocate memory") 112 | 113 | self.libc.mprotect.restype = c_int 114 | self.libc.mprotect.argtypes = [c_void_p, c_size_t, c_int] 115 | ret = self.libc.mprotect(self.addr, size, 1 | 2 | 4) 116 | if ret != 0: 117 | raise OSError("Failed to set RWX") 118 | 119 | 120 | ctypes.memmove(self.addr, code, size) 121 | 122 | func_type = CFUNCTYPE(None, POINTER(CPUID_struct), c_uint32) 123 | self.func_ptr = func_type(self.addr) 124 | 125 | def __call__(self, eax): 126 | self.func_ptr(self.r, eax) 127 | return (self.r.eax, self.r.ebx, self.r.ecx, self.r.edx) 128 | 129 | def __del__(self): 130 | if is_windows: 131 | 
self.win.VirtualFree(self.addr, 0, 0x8000) 132 | elif self.libc: 133 | # Seems to throw exception when the program ends and 134 | # libc is cleaned up before the object? 135 | self.libc.free.restype = None 136 | self.libc.free.argtypes = [c_void_p] 137 | self.libc.free(self.addr) 138 | 139 | if __name__ == "__main__": 140 | def valid_inputs(): 141 | cpuid = CPUID() 142 | for eax in (0x0, 0x80000000): 143 | highest, _, _, _ = cpuid(eax) 144 | while eax <= highest: 145 | regs = cpuid(eax) 146 | yield (eax, regs) 147 | eax += 1 148 | 149 | print(" ".join(x.ljust(8) for x in ("CPUID", "A", "B", "C", "D")).strip()) 150 | for eax, regs in valid_inputs(): 151 | print("%08x" % eax, " ".join("%08x" % reg for reg in regs)) 152 | 153 | -------------------------------------------------------------------------------- /third_party/picosha2.h: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (C) 2017 okdshin 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// */
#ifndef PICOSHA2_H
#define PICOSHA2_H
// picosha2:20140213

#ifndef PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR
#define PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR \
    1048576  //=1024*1024: default is 1MB memory
#endif

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <fstream>
#include <iterator>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>

namespace picosha2 {
// Working word. It may be wider than 32 bits (unsigned long is 64 bits on
// LP64 targets), which is why results are passed through mask_32bit().
typedef unsigned long word_t;
typedef unsigned char byte_t;

// Size of a SHA-256 digest in bytes.
static const std::size_t k_digest_size = 32;

namespace detail {
// Keeps the low 8 bits of x.
inline byte_t mask_8bit(byte_t x) { return x & 0xff; }

// Keeps the low 32 bits of x.
inline word_t mask_32bit(word_t x) { return x & 0xffffffff; }

// Per-round additive constants used by the 64 rounds in hash256_block().
const word_t add_constant[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};

// Starting value of the eight-word running digest.
const word_t initial_message_digest[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372,
                                          0xa54ff53a, 0x510e527f, 0x9b05688c,
                                          0x1f83d9ab, 0x5be0cd19};

// Bitwise choose: takes bits of y where x is set, bits of z elsewhere.
inline word_t ch(word_t x, word_t y, word_t z) { return (x & y) ^ ((~x) & z); }

// Bitwise majority of x, y and z.
inline word_t maj(word_t x, word_t y, word_t z) {
    return (x & y) ^ (x & z) ^ (y & z);
}

// Rotates the low 32 bits of x right by n (n < 32). The result is masked to
// 32 bits; x itself is expected to hold no bits above bit 31.
inline word_t rotr(word_t x, std::size_t n) {
    assert(n < 32);
    return mask_32bit((x >> n) | (x << (32 - n)));
}

inline word_t bsig0(word_t x) { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22); }

inline word_t bsig1(word_t x) { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25); }

// Plain right shift by n (n < 32).
inline word_t shr(word_t x, std::size_t n) {
    assert(n < 32);
    return x >> n;
}

inline word_t ssig0(word_t x) { return rotr(x, 7) ^ rotr(x, 18) ^ shr(x, 3); }

inline word_t ssig1(word_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ shr(x, 10); }

// Folds one 64-byte block [first, last) into the eight-word running digest
// that message_digest points at. Both iterators must be random access and
// last must equal first + 64.
template <typename RaIter1, typename RaIter2>
void hash256_block(RaIter1 message_digest, RaIter2 first, RaIter2 last) {
    assert(first + 64 == last);
    static_cast<void>(last);  // for avoiding unused-variable warning
    word_t w[64];
    std::fill(w, w + 64, 0);
    // The first 16 schedule words are the block read as big-endian 32-bit
    // values.
    for (std::size_t i = 0; i < 16; ++i) {
        w[i] = (static_cast<word_t>(mask_8bit(*(first + i * 4))) << 24) |
               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 1))) << 16) |
               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 2))) << 8) |
               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 3))));
    }
    // The remaining 48 words are derived from earlier ones.
    for (std::size_t i = 16; i < 64; ++i) {
        w[i] = mask_32bit(ssig1(w[i - 2]) + w[i - 7] + ssig0(w[i - 15]) +
                          w[i - 16]);
    }

    word_t a = *message_digest;
    word_t b = *(message_digest + 1);
    word_t c = *(message_digest + 2);
    word_t d = *(message_digest + 3);
    word_t e = *(message_digest + 4);
    word_t f = *(message_digest + 5);
    word_t g = *(message_digest + 6);
    word_t h = *(message_digest + 7);

    for (std::size_t i = 0; i < 64; ++i) {
        // temp1/temp2 may carry bits above bit 31; they are masked when
        // stored into e and a.
        word_t temp1 = h + bsig1(e) + ch(e, f, g) + add_constant[i] + w[i];
        word_t temp2 = bsig0(a) + maj(a, b, c);
        h = g;
        g = f;
        f = e;
        e = mask_32bit(d + temp1);
        d = c;
        c = b;
        b = a;
        a = mask_32bit(temp1 + temp2);
    }
    *message_digest += a;
    *(message_digest + 1) += b;
    *(message_digest + 2) += c;
    *(message_digest + 3) += d;
    *(message_digest + 4) += e;
    *(message_digest + 5) += f;
    *(message_digest + 6) += g;
    *(message_digest + 7) += h;
    for (std::size_t i = 0; i < 8; ++i) {
        *(message_digest + i) = mask_32bit(*(message_digest + i));
    }
}

}  // namespace detail

// Writes every element of [first, last) to os as two lowercase hex digits.
// Each element is reduced to a single byte first, so a negative char (for
// example from a std::string) is not sign-extended into eight digits.
// The stream's format flags and fill character are restored before returning.
template <typename InIter>
void output_hex(InIter first, InIter last, std::ostream& os) {
    const std::ios_base::fmtflags saved_flags = os.flags();
    const char saved_fill = os.fill('0');
    os.setf(std::ios::hex, std::ios::basefield);
    for (; first != last; ++first) {
        os.width(2);  // width resets after each insertion
        os << static_cast<unsigned int>(static_cast<byte_t>(*first));
    }
    os.fill(saved_fill);
    os.flags(saved_flags);
}

// Stores the hex rendering of [first, last) in hex_str.
template <typename InIter>
void bytes_to_hex_string(InIter first, InIter last, std::string& hex_str) {
    std::ostringstream oss;
    output_hex(first, last, oss);
    hex_str.assign(oss.str());
}

// Stores the hex rendering of a byte container in hex_str.
template <typename InContainer>
void bytes_to_hex_string(const InContainer& bytes, std::string& hex_str) {
    bytes_to_hex_string(bytes.begin(), bytes.end(), hex_str);
}

// Returns the hex rendering of [first, last).
template <typename InIter>
std::string bytes_to_hex_string(InIter first, InIter last) {
    std::string hex_str;
    bytes_to_hex_string(first, last, hex_str);
    return hex_str;
}

// Returns the hex rendering of a byte container.
template <typename InContainer>
std::string bytes_to_hex_string(const InContainer& bytes) {
    // Go through the iterator overload so the call cannot be confused with
    // the (iterator, iterator) two-argument form.
    return bytes_to_hex_string(bytes.begin(), bytes.end());
}

// Incremental SHA-256: call process() any number of times, then finish()
// once, then read the digest with get_hash_bytes(). init() resets the object
// for reuse.
class hash256_one_by_one {
   public:
    hash256_one_by_one() { init(); }

    // Resets the pending buffer, the byte counter and the running digest.
    void init() {
        buffer_.clear();
        std::fill(data_length_digits_, data_length_digits_ + 4, 0);
        std::copy(detail::initial_message_digest,
                  detail::initial_message_digest + 8, h_);
    }

    // Appends [first, last) to the message and hashes every complete
    // 64-byte block; the incomplete tail stays buffered for the next call.
    template <typename RaIter>
    void process(RaIter first, RaIter last) {
        add_to_data_length(static_cast<word_t>(std::distance(first, last)));
        std::copy(first, last, std::back_inserter(buffer_));
        std::size_t i = 0;
        for (; i + 64 <= buffer_.size(); i += 64) {
            detail::hash256_block(h_, buffer_.begin() + i,
                                  buffer_.begin() + i + 64);
        }
        buffer_.erase(buffer_.begin(), buffer_.begin() + i);
    }

    // Pads the buffered tail (0x80, zeros, 64-bit big-endian bit length) and
    // hashes the final block(s).
    void finish() {
        byte_t temp[64];
        std::fill(temp, temp + 64, 0);
        std::size_t remains = buffer_.size();
        std::copy(buffer_.begin(), buffer_.end(), temp);
        temp[remains] = 0x80;

        if (remains > 55) {
            // No room left for the 8 length bytes: flush this block and put
            // the length in an otherwise all-zero block.
            detail::hash256_block(h_, temp, temp + 64);
            std::fill(temp, temp + 64, 0);
        }

        write_data_bit_length(&(temp[56]));
        detail::hash256_block(h_, temp, temp + 64);
    }

    // Writes the digest big-endian into [first, last), stopping early if the
    // output range is shorter than 32 bytes.
    template <typename OutIter>
    void get_hash_bytes(OutIter first, OutIter last) const {
        for (const word_t* iter = h_; iter != h_ + 8; ++iter) {
            for (std::size_t i = 0; i < 4 && first != last; ++i) {
                *(first++) = detail::mask_8bit(
                    static_cast<byte_t>((*iter >> (24 - 8 * i))));
            }
        }
    }

   private:
    // Adds n bytes to the running length, kept as four 16-bit digits
    // (least significant first). n is consumed 16 bits at a time so no
    // intermediate sum can exceed 17 bits, whatever the width of word_t.
    void add_to_data_length(word_t n) {
        word_t carry = 0;
        for (std::size_t i = 0; i < 4; ++i) {
            const word_t sum = data_length_digits_[i] + (n & 65535u) + carry;
            data_length_digits_[i] = sum & 65535u;
            carry = sum >> 16;
            n >>= 16;
        }
    }

    // Writes the message length in bits as 8 big-endian bytes at begin.
    void write_data_bit_length(byte_t* begin) {
        word_t data_bit_length_digits[4];
        std::copy(data_length_digits_, data_length_digits_ + 4,
                  data_bit_length_digits);

        // convert byte length to bit length (multiply 8 or shift 3 times left)
        word_t carry = 0;
        for (std::size_t i = 0; i < 4; ++i) {
            word_t before_val = data_bit_length_digits[i];
            data_bit_length_digits[i] <<= 3;
            data_bit_length_digits[i] |= carry;
            data_bit_length_digits[i] &= 65535u;
            carry = (before_val >> (16 - 3)) & 65535u;
        }

        // write data_bit_length, most significant digit first
        for (int i = 3; i >= 0; --i) {
            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i] >> 8);
            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i]);
        }
    }

    std::vector<byte_t> buffer_;    // bytes not yet forming a full block
    word_t data_length_digits_[4];  // as 64bit integer (16bit x 4 integer)
    word_t h_[8];                   // running digest
};

// Stores the hex digest of a finished hasher in hex_str.
inline void get_hash_hex_string(const hash256_one_by_one& hasher,
                                std::string& hex_str) {
    byte_t hash[k_digest_size];
    hasher.get_hash_bytes(hash, hash + k_digest_size);
    bytes_to_hex_string(hash, hash + k_digest_size, hex_str);
}

// Returns the hex digest of a finished hasher.
inline std::string get_hash_hex_string(const hash256_one_by_one& hasher) {
    std::string hex_str;
    get_hash_hex_string(hasher, hex_str);
    return hex_str;
}

namespace impl {
// Random-access input: the whole range is handed to the hasher in one call.
// The int parameter (buffer size) is unused here.
template <typename RaIter, typename OutIter>
void hash256_impl(RaIter first, RaIter last, OutIter first2, OutIter last2, int,
                  std::random_access_iterator_tag) {
    hash256_one_by_one hasher;
    hasher.process(first, last);
    hasher.finish();
    hasher.get_hash_bytes(first2, last2);
}

// Any other input iterator: the range is staged through a buffer of
// buffer_size bytes. A non-positive buffer_size falls back to
// PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR; with a zero-sized buffer the
// staging loop would never advance the input.
template <typename InputIter, typename OutIter>
void hash256_impl(InputIter first, InputIter last, OutIter first2,
                  OutIter last2, int buffer_size, std::input_iterator_tag) {
    const std::size_t capacity =
        buffer_size > 0
            ? static_cast<std::size_t>(buffer_size)
            : static_cast<std::size_t>(PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR);
    std::vector<byte_t> buffer(capacity);
    hash256_one_by_one hasher;
    while (first != last) {
        std::size_t filled = 0;
        for (; filled != capacity && first != last; ++filled, ++first) {
            buffer[filled] = static_cast<byte_t>(*first);
        }
        hasher.process(buffer.begin(), buffer.begin() + filled);
    }
    hasher.finish();
    hasher.get_hash_bytes(first2, last2);
}
}  // namespace impl

// Hashes [first, last) and writes the digest into [first2, last2).
// buffer_size is only used when InIter is not a random-access iterator.
template <typename InIter, typename OutIter>
void hash256(InIter first, InIter last, OutIter first2, OutIter last2,
             int buffer_size = PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR) {
    picosha2::impl::hash256_impl(
        first, last, first2, last2, buffer_size,
        typename std::iterator_traits<InIter>::iterator_category());
}

// Hashes [first, last) into the container dst (dst is not resized).
template <typename InIter, typename OutContainer>
void hash256(InIter first, InIter last, OutContainer& dst) {
    hash256(first, last, dst.begin(), dst.end());
}

// Hashes the container src into [first, last).
template <typename InContainer, typename OutIter>
void hash256(const InContainer& src, OutIter first, OutIter last) {
    hash256(src.begin(), src.end(), first, last);
}

// Hashes the container src into the container dst (dst is not resized).
template <typename InContainer, typename OutContainer>
void hash256(const InContainer& src, OutContainer& dst) {
    hash256(src.begin(), src.end(), dst.begin(), dst.end());
}

// Stores the hex digest of [first, last) in hex_str.
template <typename InIter>
void hash256_hex_string(InIter first, InIter last, std::string& hex_str) {
    byte_t hashed[k_digest_size];
    hash256(first, last, hashed, hashed + k_digest_size);
    std::ostringstream oss;
    output_hex(hashed, hashed + k_digest_size, oss);
    hex_str.assign(oss.str());
}

// Returns the hex digest of [first, last).
template <typename InIter>
std::string hash256_hex_string(InIter first, InIter last) {
    std::string hex_str;
    hash256_hex_string(first, last, hex_str);
    return hex_str;
}

// Stores the hex digest of the string src in hex_str.
inline void hash256_hex_string(const std::string& src, std::string& hex_str) {
    hash256_hex_string(src.begin(), src.end(), hex_str);
}

// Stores the hex digest of the container src in hex_str.
template <typename InContainer>
void hash256_hex_string(const InContainer& src, std::string& hex_str) {
    hash256_hex_string(src.begin(), src.end(), hex_str);
}

// Returns the hex digest of the container src.
template <typename InContainer>
std::string hash256_hex_string(const InContainer& src) {
    return hash256_hex_string(src.begin(), src.end());
}

// Hashes everything remaining in the file stream f into [first, last).
template <typename OutIter>
void hash256(std::ifstream& f, OutIter first, OutIter last) {
    hash256(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(),
            first, last);
}
}  // namespace picosha2
#endif  // PICOSHA2_H
--------------------------------------------------------------------------------