├── .clangd ├── .gitignore ├── CMakeLists.txt ├── Config.cmake.in ├── LICENSE ├── README.md ├── benchmark ├── README.md ├── cpp │ └── main.cpp ├── js │ ├── main.js │ ├── package-lock.json │ └── package.json ├── rust │ ├── Cargo.lock │ ├── Cargo.toml │ └── src │ │ └── main.rs └── sample.csv ├── parser.hpp └── test ├── .clangd ├── CMakeLists.txt ├── data ├── bom_empty.csv ├── bom_simple.csv ├── comma_in_quotes.csv ├── delimiter.csv ├── empty.csv ├── emptyUnquoted.csv ├── empty_crlf.csv ├── empty_file.csv ├── escaped_quotes.csv ├── json.csv ├── newlines.csv ├── newlines_crlf.csv ├── quote.csv ├── quotes_and_newlines.csv ├── simple.csv ├── simple_crlf.csv ├── terminator.csv └── utf8.csv └── parser_test.cpp /.clangd: -------------------------------------------------------------------------------- 1 | CompileFlags: 2 | Add: 3 | - -std=c++11 4 | - -Wall 5 | - -Wextra 6 | - -Werror 7 | - -pedantic 8 | - -I. 9 | - -Iinclude 10 | - -Isrc 11 | 12 | Diagnostics: 13 | ClangTidy: 14 | Add: 15 | - modernize* 16 | - performance* 17 | - bugprone* 18 | - readability* 19 | Remove: 20 | - readability-identifier-length 21 | - readability-function-cognitive-complexity 22 | 23 | Index: 24 | Background: Build 25 | 26 | InlayHints: 27 | Enabled: Yes 28 | ParameterNames: Yes 29 | DeducedTypes: Yes 30 | 31 | Style: 32 | FullyQualifiedNamespaces: No 33 | 34 | Hover: 35 | ShowAKA: Yes 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o* 2 | node_modules 3 | target 4 | out 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(AriaCsvParser) 2 | cmake_minimum_required(VERSION 3.14 FATAL_ERROR) 3 | include(GNUInstallDirs) 4 | include(CMakePackageConfigHelpers) 5 | 6 | add_library(${PROJECT_NAME} INTERFACE) 7 | 
8 | set_target_properties(${PROJECT_NAME} PROPERTIES 9 | PUBLIC_HEADER parser.hpp 10 | ) 11 | 12 | target_include_directories( ${PROJECT_NAME} 13 | INTERFACE 14 | $ 15 | $ 16 | ) 17 | 18 | install(TARGETS ${PROJECT_NAME} 19 | EXPORT ${PROJECT_NAME}Targets) 20 | 21 | install(EXPORT ${PROJECT_NAME}Targets 22 | NAMESPACE AriaCsvParser:: 23 | FILE AriaCsvParserTargets.cmake 24 | DESTINATION lib/cmake/${PROJECT_NAME} 25 | ) 26 | 27 | configure_package_config_file(Config.cmake.in 28 | ${PROJECT_NAME}Config.cmake 29 | INSTALL_DESTINATION "lib/cmake/${PROJECT_NAME}" 30 | NO_SET_AND_CHECK_MACRO 31 | NO_CHECK_REQUIRED_COMPONENTS_MACRO 32 | ) 33 | 34 | install(FILES 35 | ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake 36 | DESTINATION lib/cmake/${PROJECT_NAME} 37 | ) -------------------------------------------------------------------------------- /Config.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | include ( "${CMAKE_CURRENT_LIST_DIR}/AriaCsvParserTargets.cmake" ) 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Aria Fallah 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CSV Parser 2 | 3 | Fast, simple, header-only, C++11 CSV parser. 4 | 5 | ## Usage 6 | 7 | #### Configuration 8 | 9 | You initialize the parser by passing it any input stream of characters. For 10 | example, you can read from a file 11 | 12 | ```cpp 13 | std::ifstream f("some_file.csv"); 14 | CsvParser parser(f); 15 | ``` 16 | 17 | or you can read from `stdin` 18 | 19 | ```cpp 20 | CsvParser parser(std::cin); 21 | ``` 22 | 23 | Moreover, you can configure the parser by chaining configuration methods like 24 | 25 | ```cpp 26 | CsvParser parser = CsvParser(std::cin) 27 | .delimiter(';') // delimited by ; instead of , 28 | .quote('\'') // quoted fields use ' instead of " 29 | .terminator('\0'); // terminated by \0 instead of by \r\n, \n, or \r 30 | ``` 31 | 32 | #### Parsing 33 | 34 | You can read from the CSV using a range based for loop. Each row of the CSV is 35 | represented as a `std::vector<std::string>`. 
36 | 37 | ```cpp 38 | #include <iostream> 39 | #include "../parser.hpp" 40 | 41 | using namespace aria::csv; 42 | 43 | int main() { 44 | std::ifstream f("some_file.csv"); 45 | CsvParser parser(f); 46 | 47 | for (auto& row : parser) { 48 | for (auto& field : row) { 49 | std::cout << field << " | "; 50 | } 51 | std::cout << std::endl; 52 | } 53 | } 54 | ``` 55 | 56 | Behind the scenes, when using the range based for, the parser only ever 57 | allocates as much memory as needed to represent a single row of your CSV. If 58 | that's too much, you can step down to a lower level, where you read from the CSV 59 | a field at a time, which only allocates the amount of memory needed for a single 60 | field. 61 | 62 | ```cpp 63 | #include <iostream> 64 | #include "./parser.hpp" 65 | 66 | using namespace aria::csv; 67 | 68 | int main() { 69 | CsvParser parser(std::cin); 70 | 71 | for (;;) { 72 | auto field = parser.next_field(); 73 | switch (field.type) { 74 | case FieldType::DATA: 75 | std::cout << field.data << " | "; 76 | break; 77 | case FieldType::ROW_END: 78 | std::cout << std::endl; 79 | break; 80 | case FieldType::CSV_END: 81 | std::cout << std::endl; 82 | return 0; 83 | } 84 | } 85 | } 86 | ``` 87 | 88 | It is possible to inspect the current cursor position using `parser.position()`. 89 | This will return the position of the last parsed token. This is useful when 90 | reporting things like progress through a file. You can use 91 | `file.seekg(0, std::ios::end);` to get a file size. 92 | 93 | ## Testing 94 | 95 | Run `cmake -B out && cmake --build out && ./out/parser_test` in test dir 96 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | Parses `sample.csv`, and outputs the number of rows. 
4 | 5 | #### My parser (c++) 6 | Compiled with `clang++ -std=c++11 -O2` 7 | ``` 8 | $ time ./bench.out sample.csv 9 | 36635 10 | 11 | ________________________________________________________ 12 | Executed in 42.11 millis fish external 13 | usr time 30.07 millis 0.08 millis 29.99 millis 14 | sys time 7.14 millis 2.98 millis 4.15 millis 15 | ``` 16 | 17 | #### csv 1.3.0 (rust) 18 | Compiled with `cargo build --release` 19 | ``` 20 | $ time ./rust/target/release/bench sample.csv 21 | 36634 22 | 23 | ________________________________________________________ 24 | Executed in 24.54 millis fish external 25 | usr time 16.08 millis 0.06 millis 16.03 millis 26 | sys time 6.18 millis 2.32 millis 3.86 millis 27 | ``` 28 | 29 | #### csv-parser 3.0.0 (node.js) 30 | ``` 31 | $ time node js/main.js sample.csv 32 | 36634 33 | 34 | ________________________________________________________ 35 | Executed in 194.38 millis fish external 36 | usr time 187.77 millis 0.07 millis 187.70 millis 37 | sys time 18.24 millis 2.35 millis 15.89 millis 38 | ``` 39 | -------------------------------------------------------------------------------- /benchmark/cpp/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../parser.hpp" 4 | 5 | using namespace aria::csv; 6 | 7 | int main(int, char **argv) { 8 | int count = 0; 9 | std::ifstream f(argv[1]); 10 | CsvParser parser(f); 11 | 12 | for (auto& row : parser) { 13 | ++count; 14 | } 15 | 16 | std::cout << count << std::endl; 17 | } 18 | -------------------------------------------------------------------------------- /benchmark/js/main.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const csv = require('csv-parser') 3 | 4 | let count = 0 5 | 6 | fs.createReadStream(process.argv[2]) 7 | .pipe(csv()) 8 | .on('data', function(data) { ++count }) 9 | .on('end', () => console.log(count)) 10 | 
-------------------------------------------------------------------------------- /benchmark/js/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "js", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "": { 7 | "dependencies": { 8 | "csv-parser": "^3.0.0" 9 | } 10 | }, 11 | "node_modules/csv-parser": { 12 | "version": "3.0.0", 13 | "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.0.0.tgz", 14 | "integrity": "sha512-s6OYSXAK3IdKqYO33y09jhypG/bSDHPuyCme/IdEHfWpLf/jKcpitVFyOC6UemgGk8v7Q5u2XE0vvwmanxhGlQ==", 15 | "dependencies": { 16 | "minimist": "^1.2.0" 17 | }, 18 | "bin": { 19 | "csv-parser": "bin/csv-parser" 20 | }, 21 | "engines": { 22 | "node": ">= 10" 23 | } 24 | }, 25 | "node_modules/minimist": { 26 | "version": "1.2.8", 27 | "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", 28 | "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", 29 | "funding": { 30 | "url": "https://github.com/sponsors/ljharb" 31 | } 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /benchmark/js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "csv-parser": "^3.0.0" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /benchmark/rust/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "bench" 7 | version = "0.0.1" 8 | dependencies = [ 9 | "csv", 10 | ] 11 | 12 | [[package]] 13 | name = "csv" 14 | version = "1.3.0" 15 | source = "registry+https://github.com/rust-lang/crates.io-index" 16 | checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" 17 | dependencies = [ 18 | "csv-core", 19 | "itoa", 20 | "ryu", 21 | "serde", 22 | ] 23 | 24 | [[package]] 25 | name = "csv-core" 26 | version = "0.1.11" 27 | source = "registry+https://github.com/rust-lang/crates.io-index" 28 | checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" 29 | dependencies = [ 30 | "memchr", 31 | ] 32 | 33 | [[package]] 34 | name = "itoa" 35 | version = "1.0.11" 36 | source = "registry+https://github.com/rust-lang/crates.io-index" 37 | checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" 38 | 39 | [[package]] 40 | name = "memchr" 41 | version = "2.7.4" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 44 | 45 | [[package]] 46 | name = "proc-macro2" 47 | version = "1.0.89" 48 | source = "registry+https://github.com/rust-lang/crates.io-index" 49 | checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" 50 | dependencies = [ 51 | "unicode-ident", 52 | ] 53 | 54 | [[package]] 55 | name = "quote" 56 | version = "1.0.37" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 59 | dependencies = [ 60 | "proc-macro2", 61 | ] 62 | 63 | [[package]] 64 | name = "ryu" 65 | version = "1.0.18" 66 | source = "registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" 68 | 69 | [[package]] 70 | name = "serde" 71 | version = "1.0.213" 72 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" 74 | dependencies = [ 75 | "serde_derive", 76 | ] 77 | 78 | [[package]] 79 | name = "serde_derive" 80 | version = "1.0.213" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" 83 | dependencies = [ 84 | "proc-macro2", 85 | "quote", 86 | "syn", 87 | ] 88 | 89 | [[package]] 90 | name = "syn" 91 | version = "2.0.85" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" 94 | dependencies = [ 95 | "proc-macro2", 96 | "quote", 97 | "unicode-ident", 98 | ] 99 | 100 | [[package]] 101 | name = "unicode-ident" 102 | version = "1.0.13" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" 105 | -------------------------------------------------------------------------------- /benchmark/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bench" 3 | version = "0.0.1" 4 | 5 | [dependencies] 6 | csv = "1.3.0" 7 | -------------------------------------------------------------------------------- /benchmark/rust/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate csv; 2 | 3 | use std::env; 4 | use std::error::Error; 5 | use std::ffi::OsString; 6 | use std::fs::File; 7 | 8 | fn main() { 9 | let mut count = 0; 10 | let file_path = get_first_arg().unwrap(); 11 | let file = File::open(file_path).unwrap(); 12 | let mut rdr = csv::Reader::from_reader(file); 13 | 14 | for _ in rdr.records() { 15 | count += 1; 16 | } 17 | 18 | println!("{}", count); 19 | } 20 | 21 | fn get_first_arg() -> Result> { 22 | match env::args_os().nth(1) { 
#ifndef ARIA_CSV_H
#define ARIA_CSV_H

#include <cstddef>
#include <istream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace aria {
namespace csv {
// Row terminator. Term::CRLF is a sentinel meaning "any of '\r', '\n' or
// "\r\n""; every other value holds the character code of a custom terminator
// (see CsvParser::terminator).
// NOTE(review): a custom terminator whose char value equals -2 would be
// indistinguishable from the sentinel on platforms where char is signed.
enum class Term { CRLF = -2 };

// Kind of token handed back by CsvParser::next_field().
enum class FieldType { DATA, ROW_END, CSV_END };

// A whole CSV document: a list of rows, each row a list of fields.
using CSV = std::vector<std::vector<std::string>>;

// Checking for '\n', '\r', and '\r\n' by default; otherwise compares the
// character against the single custom terminator stored in `t`.
inline auto operator==(const char c, const Term t) -> bool {
  switch (t) {
  case Term::CRLF:
    return c == '\r' || c == '\n';
  default:
    return static_cast<char>(t) == c;
  }
}

inline auto operator!=(const char c, const Term t) -> bool { return !(c == t); }

// Wraps returned fields so we can also indicate
// that we hit row endings or the end of the csv itself.
// `data` is only meaningful when `type == FieldType::DATA`.
struct Field {
  explicit Field(FieldType t) : type(t) {}
  explicit Field(std::string &&str)
      : type(FieldType::DATA), data(std::move(str)) {}

  FieldType type;
  std::string data;
};

// Reads and parses lines from a csv file.
//
// The parser pulls characters from the stream in INPUTBUF_CAP sized chunks
// and runs them through a small state machine. It can be consumed either
// field by field (next_field) or row by row (begin/end iterators).
// The stream passed to the constructor is stored by pointer, so it must
// outlive the parser.
class CsvParser {
private:
  // CSV state for state machine
  enum class State {
    START_OF_FIELD,
    IN_FIELD,
    IN_QUOTED_FIELD,
    IN_ESCAPED_QUOTE,
    END_OF_ROW,
    EMPTY
  };
  State m_state = State::START_OF_FIELD;

  // Configurable attributes
  char m_quote = '"';
  char m_delimiter = ',';
  Term m_terminator = Term::CRLF;
  std::istream *m_input;

  // Buffer capacities
  static constexpr int FIELDBUF_CAP = 1024;
  static constexpr int INPUTBUF_CAP = 1024 * 128;

  // Buffers
  std::string m_fieldbuf{};
  std::vector<char> m_inputbuf = std::vector<char>(INPUTBUF_CAP);

  // Misc
  bool m_eof = false;
  // True once the first chunk has been inspected for a UTF-8 byte order mark.
  bool m_bom_checked = false;
  // True while a field has been started (a delimiter was consumed or a quote
  // was opened) but not yet returned. Lets the end-of-input path emit a
  // trailing empty field such as the last one in "1,2," instead of losing it.
  bool m_field_pending = false;
  std::size_t m_cursor = 0;     // index of the next unread byte in m_inputbuf
  std::size_t m_bytes_read = 0; // number of valid bytes in m_inputbuf
  // Stream offset of the first byte currently held in m_inputbuf.
  std::streamoff m_scanposition = 0;

public:
  // Delete copy constructor and assignment
  CsvParser(const CsvParser &) = delete;
  auto operator=(const CsvParser &) -> CsvParser & = delete;

  // Allow move operations
  CsvParser(CsvParser &&) = default;
  auto operator=(CsvParser &&) -> CsvParser & = default;

  // Creates the CSV parser which by default, splits on commas,
  // uses quotes to escape, and handles CSV files that end in either
  // '\r', '\n', or '\r\n'.
  // Throws std::runtime_error when the stream is not in a good state.
  explicit CsvParser(std::istream &input) : m_input(&input) {
    // Reserve space upfront to improve performance
    m_fieldbuf.reserve(FIELDBUF_CAP);
    if (!m_input->good()) {
      throw std::runtime_error("Something is wrong with input stream");
    }
  }

  // Change the quote character. Returns an rvalue reference to *this so the
  // configuration calls can be chained on a temporary and moved into place.
  auto quote(char c) noexcept -> CsvParser && {
    m_quote = c;
    return std::move(*this);
  }

  // Change the delimiter character
  auto delimiter(char c) noexcept -> CsvParser && {
    m_delimiter = c;
    return std::move(*this);
  }

  // Change the terminator character
  auto terminator(char c) noexcept -> CsvParser && {
    m_terminator = static_cast<Term>(c);
    return std::move(*this);
  }

  // The parser is in the empty state when there are
  // no more tokens left to read from the input buffer
  auto empty() const noexcept -> bool { return m_state == State::EMPTY; }

  // Offset, counted in bytes from where the parser started reading, of the
  // next character the parser will look at (one past the last consumed one).
  // This is not the position of the underlying stream, which runs ahead
  // because input is read in chunks. A skipped UTF-8 BOM counts as consumed.
  auto position() const -> std::streamoff {
    return m_scanposition + static_cast<std::streamoff>(m_cursor);
  }

  // Reads a single token from the CSV: a DATA field, a ROW_END marker, or
  // CSV_END once the input is exhausted (and on every call after that).
  auto next_field() -> Field {
    if (empty()) {
      return Field(FieldType::CSV_END);
    }
    m_fieldbuf.clear();

    // This loop runs until either the parser has
    // read a full field or until there's no tokens left to read
    for (;;) {
      char *maybe_token = top_token();

      // Out of input: flush the field that was in progress, if any. A field
      // counts as in progress when it has content or when it was opened by a
      // preceding delimiter/quote (it is then a legitimate empty field).
      if (maybe_token == nullptr) {
        m_state = State::EMPTY;
        if (!m_fieldbuf.empty() || m_field_pending) {
          m_field_pending = false;
          return Field(std::move(m_fieldbuf));
        }
        return Field(FieldType::CSV_END);
      }

      // Parsing the CSV is done using a finite state machine
      const char c = *maybe_token;
      switch (m_state) {
      case State::START_OF_FIELD:
        m_cursor++;
        if (c == m_terminator) {
          handle_crlf(c);
          m_state = State::END_OF_ROW;
          m_field_pending = false;
          return Field(std::move(m_fieldbuf));
        }

        if (c == m_quote) {
          m_state = State::IN_QUOTED_FIELD;
          m_field_pending = true;
        } else if (c == m_delimiter) {
          // Empty field; another field necessarily follows the delimiter.
          m_field_pending = true;
          return Field(std::move(m_fieldbuf));
        } else {
          m_state = State::IN_FIELD;
          m_fieldbuf += c;
        }

        break;

      case State::IN_FIELD:
        m_cursor++;
        if (c == m_terminator) {
          handle_crlf(c);
          m_state = State::END_OF_ROW;
          m_field_pending = false;
          return Field(std::move(m_fieldbuf));
        }

        if (c == m_delimiter) {
          m_state = State::START_OF_FIELD;
          m_field_pending = true;
          return Field(std::move(m_fieldbuf));
        }

        m_fieldbuf += c;
        break;

      case State::IN_QUOTED_FIELD:
        // Inside quotes everything is literal, including delimiters and
        // terminators; only a quote character is special.
        m_cursor++;
        if (c == m_quote) {
          m_state = State::IN_ESCAPED_QUOTE;
        } else {
          m_fieldbuf += c;
        }

        break;

      case State::IN_ESCAPED_QUOTE:
        // The previous character was a quote inside a quoted field: it was
        // either the closing quote or the first half of a doubled quote.
        m_cursor++;
        if (c == m_terminator) {
          handle_crlf(c);
          m_state = State::END_OF_ROW;
          m_field_pending = false;
          return Field(std::move(m_fieldbuf));
        }

        if (c == m_quote) {
          m_state = State::IN_QUOTED_FIELD;
          m_fieldbuf += c;
        } else if (c == m_delimiter) {
          m_state = State::START_OF_FIELD;
          m_field_pending = true;
          return Field(std::move(m_fieldbuf));
        } else {
          m_state = State::IN_FIELD;
          m_fieldbuf += c;
        }

        break;

      case State::END_OF_ROW:
        // Nothing is consumed here; the row marker is reported before the
        // first field of the following row is parsed.
        m_state = State::START_OF_FIELD;
        return Field(FieldType::ROW_END);

      case State::EMPTY:
        throw std::logic_error("You goofed");
      }
    }
  }

private:
  // When the parser hits the end of a line it needs
  // to check the special case of '\r\n' as a terminator.
  // If it finds that the previous token was a '\r', and
  // the next token will be a '\n', it skips the '\n'.
  void handle_crlf(const char c) {
    if (m_terminator != Term::CRLF || c != '\r') {
      return;
    }

    char *token = top_token();
    if ((token != nullptr) && *token == '\n') {
      m_cursor++;
    }
  }

  // Pulls the next token from the input buffer, but does not move
  // the cursor forward. If the stream is empty and the input buffer
  // is also empty return a nullptr.
  auto top_token() -> char * {
    if (m_cursor == m_bytes_read) {
      // Buffer fully consumed and the stream already reported end of file
      if (m_eof) {
        return nullptr;
      }

      fill_buffer();
      // The refill produced nothing to parse (no bytes, or only a BOM)
      if (m_cursor == m_bytes_read) {
        return nullptr;
      }
    }

    return &m_inputbuf[m_cursor];
  }

  // Replaces the contents of the input buffer with the next chunk of the
  // stream. Must only be called once the current buffer is fully consumed.
  void fill_buffer() {
    // The previous chunk is done, so the new one starts right after it.
    m_scanposition += static_cast<std::streamoff>(m_bytes_read);

    m_input->read(m_inputbuf.data(), INPUTBUF_CAP);
    m_bytes_read = static_cast<std::size_t>(m_input->gcount());
    m_eof = m_input->eof();
    m_cursor = 0;

    // Skip a UTF-8 byte order mark, which can only appear in the first chunk.
    if (!m_bom_checked) {
      m_bom_checked = true;
      if (m_bytes_read >= 3 && m_inputbuf[0] == '\xEF' &&
          m_inputbuf[1] == '\xBB' && m_inputbuf[2] == '\xBF') {
        m_cursor = 3;
      }
    }
  }

public:
  // Iterator implementation for the CSV parser, which reads
  // from the CSV row by row in the form of a vector of strings.
  // It is a single-pass input iterator: every iterator created from the same
  // parser advances the same underlying parser state.
  class iterator {
  public:
    using difference_type = std::ptrdiff_t;
    using value_type = std::vector<std::string>;
    using pointer = const std::vector<std::string> *;
    using reference = const std::vector<std::string> &;
    using iterator_category = std::input_iterator_tag;

    // A non-end iterator immediately reads the first row from the parser;
    // an end iterator stays at row index -1 with an empty row.
    explicit iterator(CsvParser *p, bool end = false) : m_parser(p) {
      static constexpr std::size_t DEFAULT_ROW_CAPACITY = 50;
      if (!end) {
        m_row.reserve(DEFAULT_ROW_CAPACITY);
        m_current_row = 0;
        next();
      }
    }

    auto operator++() -> iterator & {
      next();
      return *this;
    }

    auto operator++(int) -> iterator {
      iterator i = (*this);
      ++(*this);
      return i;
    }

    // Iterators compare equal when they agree on row index and field count.
    // The end state is row index -1 with zero fields.
    auto operator==(const iterator &other) const -> bool {
      return m_current_row == other.m_current_row &&
             m_row.size() == other.m_row.size();
    }

    auto operator!=(const iterator &other) const -> bool {
      return !(*this == other);
    }

    auto operator*() const -> reference { return m_row; }

    auto operator->() const -> pointer { return &m_row; }

  private:
    value_type m_row{};
    CsvParser *m_parser;
    int m_current_row = -1;

    // Reads fields until the current row (or the whole CSV) ends. Existing
    // strings in m_row are overwritten in place so their storage is reused.
    void next() {
      value_type::size_type num_fields = 0;
      for (;;) {
        auto field = m_parser->next_field();
        switch (field.type) {
        case FieldType::CSV_END:
          // A final row without a terminator is still delivered: the row
          // index becomes -1 but the row keeps its fields, so the iterator
          // only equals end() after the next increment empties it.
          if (num_fields < m_row.size()) {
            m_row.resize(num_fields);
          }
          m_current_row = -1;
          return;
        case FieldType::ROW_END:
          if (num_fields < m_row.size()) {
            m_row.resize(num_fields);
          }
          m_current_row++;
          return;
        case FieldType::DATA:
          if (num_fields < m_row.size()) {
            m_row[num_fields] = std::move(field.data);
          } else {
            m_row.push_back(std::move(field.data));
          }
          num_fields++;
          break;
        }
      }
    }
  };

  auto begin() -> iterator { return iterator(this); }
  auto end() -> iterator { return iterator(this, true); }
};
} // namespace csv
} // namespace aria
#endif
st.,"Anytown, WW",08123 -------------------------------------------------------------------------------- /test/data/delimiter.csv: -------------------------------------------------------------------------------- 1 | a;b;c 2 | 1;2;3 3 | 4;5;, 4 | -------------------------------------------------------------------------------- /test/data/empty.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,"","" 3 | 2,3,4 -------------------------------------------------------------------------------- /test/data/emptyUnquoted.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,, 3 | 2,3,4 -------------------------------------------------------------------------------- /test/data/empty_crlf.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,"","" 3 | 2,3,4 -------------------------------------------------------------------------------- /test/data/empty_file.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AriaFallah/csv-parser/43961a918f150088dd0c288b0c9c551e0be795f8/test/data/empty_file.csv -------------------------------------------------------------------------------- /test/data/escaped_quotes.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 1,"ha ""ha"" ha" 3 | 3,4 4 | -------------------------------------------------------------------------------- /test/data/json.csv: -------------------------------------------------------------------------------- 1 | key,val 2 | 1,"{""type"": ""Point"", ""coordinates"": [102.0, 0.5]}" 3 | -------------------------------------------------------------------------------- /test/data/newlines.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,2,3 3 | "Once upon 4 | a time",5,6 5 | 7,8,9 6 | 
-------------------------------------------------------------------------------- /test/data/newlines_crlf.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,2,3 3 | "Once upon 4 | a time",5,6 5 | 7,8,9 6 | -------------------------------------------------------------------------------- /test/data/quote.csv: -------------------------------------------------------------------------------- 1 | '1, 2, 3','4, 5, 6',' 2 | 7 3 | 8 4 | 9' 5 | -------------------------------------------------------------------------------- /test/data/quotes_and_newlines.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 1,"ha 3 | ""ha"" 4 | ha" 5 | 3,4 6 | -------------------------------------------------------------------------------- /test/data/simple.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,2,3 3 | -------------------------------------------------------------------------------- /test/data/simple_crlf.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,2,3 3 | -------------------------------------------------------------------------------- /test/data/terminator.csv: -------------------------------------------------------------------------------- 1 | a,b,c;1,2,3;4,5,6 2 | -------------------------------------------------------------------------------- /test/data/utf8.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,2,3 3 | 4,5,ʤ -------------------------------------------------------------------------------- /test/parser_test.cpp: -------------------------------------------------------------------------------- 1 | #include "../parser.hpp" 2 | #include 3 | #include 4 | 5 | using namespace aria::csv; 6 | 7 | auto read_all(CsvParser &p) -> CSV { 8 | CSV csv; 9 | for (const auto &row : p) { 10 | csv.push_back(row); 11 | } 12 | return 
csv; 13 | } 14 | 15 | TEST(CsvParserTest, CommaInQuotes) { 16 | std::ifstream f(TEST_DATA_DIR "/comma_in_quotes.csv"); 17 | CsvParser parser(f); 18 | CSV expected = {{"first", "last", "address", "city", "zip"}, 19 | {"John", "Doe", "120 any st.", "Anytown, WW", "08123"}}; 20 | EXPECT_EQ(read_all(parser), expected); 21 | } 22 | 23 | TEST(CsvParserTest, Empty) { 24 | std::ifstream f(TEST_DATA_DIR "/empty.csv"); 25 | CsvParser parser(f); 26 | CSV expected = {{"a", "b", "c"}, {"1", "", ""}, {"2", "3", "4"}}; 27 | EXPECT_EQ(read_all(parser), expected); 28 | } 29 | 30 | TEST(CsvParserTest, EmptyUnquoted) { 31 | std::ifstream f(TEST_DATA_DIR "/emptyUnquoted.csv"); 32 | CsvParser parser(f); 33 | CSV expected = {{"a", "b", "c"}, {"1", "", ""}, {"2", "3", "4"}}; 34 | EXPECT_EQ(read_all(parser), expected); 35 | } 36 | 37 | TEST(CsvParserTest, EmptyCrlf) { 38 | std::ifstream f(TEST_DATA_DIR "/empty_crlf.csv"); 39 | CsvParser parser(f); 40 | CSV expected = {{"a", "b", "c"}, {"1", "", ""}, {"2", "3", "4"}}; 41 | EXPECT_EQ(read_all(parser), expected); 42 | } 43 | 44 | TEST(CsvParserTest, EscapedQuotes) { 45 | std::ifstream f(TEST_DATA_DIR "/escaped_quotes.csv"); 46 | CsvParser parser(f); 47 | CSV expected = {{"a", "b"}, {"1", R"(ha "ha" ha)"}, {"3", "4"}}; 48 | EXPECT_EQ(read_all(parser), expected); 49 | } 50 | 51 | TEST(CsvParserTest, Json) { 52 | std::ifstream f(TEST_DATA_DIR "/json.csv"); 53 | CsvParser parser(f); 54 | CSV expected = {{"key", "val"}, 55 | {"1", R"({"type": "Point", "coordinates": [102.0, 0.5]})"}}; 56 | EXPECT_EQ(read_all(parser), expected); 57 | } 58 | 59 | TEST(CsvParserTest, Newlines) { 60 | std::ifstream f(TEST_DATA_DIR "/newlines.csv"); 61 | CsvParser parser(f); 62 | CSV expected = {{"a", "b", "c"}, 63 | {"1", "2", "3"}, 64 | {"Once upon \na time", "5", "6"}, 65 | {"7", "8", "9"}}; 66 | EXPECT_EQ(read_all(parser), expected); 67 | } 68 | 69 | TEST(CsvParserTest, NewlinesCrlf) { 70 | std::ifstream f(TEST_DATA_DIR "/newlines_crlf.csv"); 71 | CsvParser 
parser(f); 72 | CSV expected = {{"a", "b", "c"}, 73 | {"1", "2", "3"}, 74 | {"Once upon \r\na time", "5", "6"}, 75 | {"7", "8", "9"}}; 76 | EXPECT_EQ(read_all(parser), expected); 77 | } 78 | 79 | TEST(CsvParserTest, QuotesAndNewlines) { 80 | std::ifstream f(TEST_DATA_DIR "/quotes_and_newlines.csv"); 81 | CsvParser parser(f); 82 | CSV expected = {{"a", "b"}, {"1", "ha \n\"ha\" \nha"}, {"3", "4"}}; 83 | EXPECT_EQ(read_all(parser), expected); 84 | } 85 | 86 | TEST(CsvParserTest, Simple) { 87 | std::ifstream f(TEST_DATA_DIR "/simple.csv"); 88 | CsvParser parser(f); 89 | CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}}; 90 | EXPECT_EQ(read_all(parser), expected); 91 | } 92 | 93 | TEST(CsvParserTest, SimpleCrlf) { 94 | std::ifstream f(TEST_DATA_DIR "/simple_crlf.csv"); 95 | CsvParser parser(f); 96 | CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}}; 97 | EXPECT_EQ(read_all(parser), expected); 98 | } 99 | 100 | TEST(CsvParserTest, Utf8) { 101 | std::ifstream f(TEST_DATA_DIR "/utf8.csv"); 102 | CsvParser parser(f); 103 | CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}, {"4", "5", "ʤ"}}; 104 | EXPECT_EQ(read_all(parser), expected); 105 | } 106 | 107 | TEST(CsvParserTest, DifferentDelimiter) { 108 | std::ifstream f(TEST_DATA_DIR "/delimiter.csv"); 109 | CsvParser parser = CsvParser(f).delimiter(';'); 110 | CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}, {"4", "5", ","}}; 111 | EXPECT_EQ(read_all(parser), expected); 112 | } 113 | 114 | TEST(CsvParserTest, DifferentTerminator) { 115 | std::ifstream f(TEST_DATA_DIR "/terminator.csv"); 116 | CsvParser parser = CsvParser(f).terminator(';'); 117 | CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}, {"4", "5", "6\n"}}; 118 | EXPECT_EQ(read_all(parser), expected); 119 | } 120 | 121 | TEST(CsvParserTest, DifferentQuote) { 122 | std::ifstream f(TEST_DATA_DIR "/quote.csv"); 123 | CsvParser parser = CsvParser(f).quote('\''); 124 | CSV expected = {{"1, 2, 3", "4, 5, 6", "\n7\n8\n9"}}; 125 | EXPECT_EQ(read_all(parser), expected); 126 | 
} 127 | 128 | TEST(CsvParserTest, BomSimple) { 129 | std::ifstream f(TEST_DATA_DIR "/bom_simple.csv"); 130 | CsvParser parser(f); 131 | CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}}; 132 | EXPECT_EQ(read_all(parser), expected); 133 | } 134 | 135 | TEST(CsvParserTest, BomEmpty) { 136 | std::ifstream f(TEST_DATA_DIR "/bom_empty.csv"); 137 | CsvParser parser(f); 138 | CSV expected = {}; 139 | EXPECT_EQ(read_all(parser), expected); 140 | } 141 | 142 | TEST(CsvParserTest, EmptyFile) { 143 | std::ifstream f(TEST_DATA_DIR "/empty_file.csv"); 144 | CsvParser parser(f); 145 | CSV expected = {}; 146 | EXPECT_EQ(read_all(parser), expected); 147 | } 148 | --------------------------------------------------------------------------------