├── .clangd
├── .gitignore
├── CMakeLists.txt
├── Config.cmake.in
├── LICENSE
├── README.md
├── benchmark
    ├── README.md
    ├── cpp
    │   └── main.cpp
    ├── js
    │   ├── main.js
    │   ├── package-lock.json
    │   └── package.json
    ├── rust
    │   ├── Cargo.lock
    │   ├── Cargo.toml
    │   └── src
    │   │   └── main.rs
    └── sample.csv
├── parser.hpp
└── test
    ├── .clangd
    ├── CMakeLists.txt
    ├── data
        ├── bom_empty.csv
        ├── bom_simple.csv
        ├── comma_in_quotes.csv
        ├── delimiter.csv
        ├── empty.csv
        ├── emptyUnquoted.csv
        ├── empty_crlf.csv
        ├── empty_file.csv
        ├── escaped_quotes.csv
        ├── json.csv
        ├── newlines.csv
        ├── newlines_crlf.csv
        ├── quote.csv
        ├── quotes_and_newlines.csv
        ├── simple.csv
        ├── simple_crlf.csv
        ├── terminator.csv
        └── utf8.csv
    └── parser_test.cpp


/.clangd:
--------------------------------------------------------------------------------
 1 | CompileFlags:
 2 |   Add:
 3 |     - -std=c++11
 4 |     - -Wall
 5 |     - -Wextra
 6 |     - -Werror
 7 |     - -pedantic
 8 |     - -I.
 9 |     - -Iinclude
10 |     - -Isrc
11 | 
12 | Diagnostics:
13 |   ClangTidy:
14 |     Add:
15 |       - modernize*
16 |       - performance*
17 |       - bugprone*
18 |       - readability*
19 |     Remove:
20 |       - readability-identifier-length
21 |       - readability-function-cognitive-complexity
22 | 
23 | Index:
24 |   Background: Build
25 | 
26 | InlayHints:
27 |   Enabled: Yes
28 |   ParameterNames: Yes
29 |   DeducedTypes: Yes
30 | 
31 | Style:
32 |   FullyQualifiedNamespaces: No
33 | 
34 | Hover:
35 |   ShowAKA: Yes
36 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o*
2 | node_modules
3 | target
4 | out
5 | .DS_Store
6 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 2 | project(AriaCsvParser LANGUAGES CXX)
 3 | 
 4 | include(GNUInstallDirs)
 5 | include(CMakePackageConfigHelpers)
 6 | 
 7 | # Install location (relative to the prefix) of the package config files.
 8 | set(aria_csv_cmake_dir "lib/cmake/${PROJECT_NAME}")
 9 | 
10 | # Header-only library: consumers only inherit the include directory.
11 | add_library(${PROJECT_NAME} INTERFACE)
12 | add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
13 | 
14 | target_include_directories(${PROJECT_NAME}
15 |     INTERFACE
16 |         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
17 |         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
18 | )
19 | 
20 | # Install the header explicitly instead of through the PUBLIC_HEADER target
21 | # property, which INTERFACE libraries only accept from CMake 3.19 onwards.
22 | install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/parser.hpp"
23 |     DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
24 | )
25 | 
26 | install(TARGETS ${PROJECT_NAME}
27 |     EXPORT ${PROJECT_NAME}Targets
28 | )
29 | 
30 | install(EXPORT ${PROJECT_NAME}Targets
31 |     NAMESPACE AriaCsvParser::
32 |     FILE AriaCsvParserTargets.cmake
33 |     DESTINATION "${aria_csv_cmake_dir}"
34 | )
35 | 
36 | # Generates AriaCsvParserConfig.cmake, the entry point for find_package().
37 | configure_package_config_file(
38 |     "${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in"
39 |     "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
40 |     INSTALL_DESTINATION "${aria_csv_cmake_dir}"
41 |     NO_SET_AND_CHECK_MACRO
42 |     NO_CHECK_REQUIRED_COMPONENTS_MACRO
43 | )
44 | 
45 | install(FILES
46 |     "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
47 |     DESTINATION "${aria_csv_cmake_dir}"
48 | )


--------------------------------------------------------------------------------
/Config.cmake.in:
--------------------------------------------------------------------------------
1 | @PACKAGE_INIT@
2 | # Load the targets written by install(EXPORT ...) into this same directory.
3 | include("${CMAKE_CURRENT_LIST_DIR}/AriaCsvParserTargets.cmake")
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Aria Fallah
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CSV Parser
 2 | 
 3 | Fast, simple, header-only, C++11 CSV parser.
 4 | 
 5 | ## Usage
 6 | 
 7 | #### Configuration
 8 | 
 9 | You initialize the parser by passing it any input stream of characters. For
10 | example, you can read from a file
11 | 
12 | ```cpp
13 | std::ifstream f("some_file.csv");
14 | CsvParser parser(f);
15 | ```
16 | 
17 | or you can read from `stdin`
18 | 
19 | ```cpp
20 | CsvParser parser(std::cin);
21 | ```
22 | 
23 | Moreover, you can configure the parser by chaining configuration methods like
24 | 
25 | ```cpp
26 | CsvParser parser = CsvParser(std::cin)
27 |   .delimiter(';')    // delimited by ; instead of ,
28 |   .quote('\'')       // quoted fields use ' instead of "
29 |   .terminator('\0'); // terminated by \0 instead of by \r\n, \n, or \r
30 | ```
31 | 
32 | #### Parsing
33 | 
34 | You can read from the CSV using a range based for loop. Each row of the CSV is
35 | represented as a `std::vector<std::string>`.
36 | 
37 | ```cpp
38 | #include <iostream>
39 | #include "../parser.hpp"
40 | 
41 | using namespace aria::csv;
42 | 
43 | int main() {
44 |   std::ifstream f("some_file.csv");
45 |   CsvParser parser(f);
46 | 
47 |   for (auto& row : parser) {
48 |     for (auto& field : row) {
49 |       std::cout << field << " | ";
50 |     }
51 |     std::cout << std::endl;
52 |   }
53 | }
54 | ```
55 | 
56 | Behind the scenes, when using the range based for, the parser only ever
57 | allocates as much memory as needed to represent a single row of your CSV. If
58 | that's too much, you can step down to a lower level, where you read from the CSV
59 | a field at a time, which only allocates the amount of memory needed for a single
60 | field.
61 | 
62 | ```cpp
63 | #include <iostream>
64 | #include "./parser.hpp"
65 | 
66 | using namespace aria::csv;
67 | 
68 | int main() {
69 |   CsvParser parser(std::cin);
70 | 
71 |   for (;;) {
72 |     auto field = parser.next_field();
73 |     switch (field.type) {
74 |       case FieldType::DATA:
75 |         std::cout << field.data << " | ";
76 |         break;
77 |       case FieldType::ROW_END:
78 |         std::cout << std::endl;
79 |         break;
80 |       case FieldType::CSV_END:
81 |         std::cout << std::endl;
82 |         return 0;
83 |     }
84 |   }
85 | }
86 | ```
87 | 
88 | You can inspect the current cursor position using `parser.position()`, which
89 | returns the position of the last parsed token. This is useful for reporting
90 | progress through a file: to get the file size, call `file.seekg(0, std::ios::end)`
91 | followed by `file.tellg()`, then seek back to the start before parsing.
92 | 
93 | ## Testing
94 | 
95 | Run `cmake -B out && cmake --build out && ./out/parser_test` from the `test` directory.
96 | 


--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
 1 | # Benchmark
 2 | 
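 3 | Parses `sample.csv` and outputs the number of rows. The C++ count is one higher because the Rust and Node.js readers treat the first row as a header and do not count it.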
 4 | 
 5 | #### My parser (c++)
 6 | Compiled with `clang++ -std=c++11 -O2`
 7 | ```
 8 | $ time ./bench.out sample.csv
 9 | 36635
10 | 
11 | ________________________________________________________
12 | Executed in   42.11 millis    fish           external
13 |    usr time   30.07 millis    0.08 millis   29.99 millis
14 |    sys time    7.14 millis    2.98 millis    4.15 millis
15 | ```
16 | 
17 | #### csv 1.3.0 (rust)
18 | Compiled with `cargo build --release`
19 | ```
20 | $ time ./rust/target/release/bench sample.csv
21 | 36634
22 | 
23 | ________________________________________________________
24 | Executed in   24.54 millis    fish           external
25 |    usr time   16.08 millis    0.06 millis   16.03 millis
26 |    sys time    6.18 millis    2.32 millis    3.86 millis
27 | ```
28 | 
29 | #### csv-parser 3.0.0 (node.js)
30 | ```
31 | $ time node js/main.js sample.csv
32 | 36634
33 | 
34 | ________________________________________________________
35 | Executed in  194.38 millis    fish           external
36 |    usr time  187.77 millis    0.07 millis  187.70 millis
37 |    sys time   18.24 millis    2.35 millis   15.89 millis
38 | ```
39 | 


--------------------------------------------------------------------------------
/benchmark/cpp/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include <iostream>
 3 | #include "../../parser.hpp"
 4 | 
 5 | using namespace aria::csv;
 6 | 
 7 | // Counts the rows of the CSV file given as the first command line argument
 8 | // and prints the total. Returns 1 if the argument is missing or the file
 9 | // cannot be opened.
10 | int main(int argc, char **argv) {
11 |   if (argc < 2) {
12 |     std::cerr << "usage: bench <file.csv>" << std::endl;
13 |     return 1;
14 |   }
15 | 
16 |   std::ifstream f(argv[1]);
17 |   if (!f) {
18 |     std::cerr << "could not open " << argv[1] << std::endl;
19 |     return 1;
20 |   }
21 | 
22 |   int count = 0;
23 |   CsvParser parser(f);
24 |   for (const auto &row : parser) {
25 |     (void)row; // only the number of rows is needed
26 |     ++count;
27 |   }
28 | 
29 |   std::cout << count << std::endl;
30 | }
31 | 


--------------------------------------------------------------------------------
/benchmark/js/main.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs')
 2 | const csv = require('csv-parser')
 3 | 
 4 | let count = 0
 5 | 
 6 | fs.createReadStream(process.argv[2])
 7 |   .pipe(csv())
 8 |   .on('data', function(data) { ++count })
 9 |   .on('end', () => console.log(count))
10 | 


--------------------------------------------------------------------------------
/benchmark/js/package-lock.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "js",
 3 |   "lockfileVersion": 3,
 4 |   "requires": true,
 5 |   "packages": {
 6 |     "": {
 7 |       "dependencies": {
 8 |         "csv-parser": "^3.0.0"
 9 |       }
10 |     },
11 |     "node_modules/csv-parser": {
12 |       "version": "3.0.0",
13 |       "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.0.0.tgz",
14 |       "integrity": "sha512-s6OYSXAK3IdKqYO33y09jhypG/bSDHPuyCme/IdEHfWpLf/jKcpitVFyOC6UemgGk8v7Q5u2XE0vvwmanxhGlQ==",
15 |       "dependencies": {
16 |         "minimist": "^1.2.0"
17 |       },
18 |       "bin": {
19 |         "csv-parser": "bin/csv-parser"
20 |       },
21 |       "engines": {
22 |         "node": ">= 10"
23 |       }
24 |     },
25 |     "node_modules/minimist": {
26 |       "version": "1.2.8",
27 |       "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
28 |       "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
29 |       "funding": {
30 |         "url": "https://github.com/sponsors/ljharb"
31 |       }
32 |     }
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/benchmark/js/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "dependencies": {
3 |     "csv-parser": "^3.0.0"
4 |   }
5 | }
6 | 


--------------------------------------------------------------------------------
/benchmark/rust/Cargo.lock:
--------------------------------------------------------------------------------
  1 | # This file is automatically @generated by Cargo.
  2 | # It is not intended for manual editing.
  3 | version = 3
  4 | 
  5 | [[package]]
  6 | name = "bench"
  7 | version = "0.0.1"
  8 | dependencies = [
  9 |  "csv",
 10 | ]
 11 | 
 12 | [[package]]
 13 | name = "csv"
 14 | version = "1.3.0"
 15 | source = "registry+https://github.com/rust-lang/crates.io-index"
 16 | checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe"
 17 | dependencies = [
 18 |  "csv-core",
 19 |  "itoa",
 20 |  "ryu",
 21 |  "serde",
 22 | ]
 23 | 
 24 | [[package]]
 25 | name = "csv-core"
 26 | version = "0.1.11"
 27 | source = "registry+https://github.com/rust-lang/crates.io-index"
 28 | checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
 29 | dependencies = [
 30 |  "memchr",
 31 | ]
 32 | 
 33 | [[package]]
 34 | name = "itoa"
 35 | version = "1.0.11"
 36 | source = "registry+https://github.com/rust-lang/crates.io-index"
 37 | checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 38 | 
 39 | [[package]]
 40 | name = "memchr"
 41 | version = "2.7.4"
 42 | source = "registry+https://github.com/rust-lang/crates.io-index"
 43 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 44 | 
 45 | [[package]]
 46 | name = "proc-macro2"
 47 | version = "1.0.89"
 48 | source = "registry+https://github.com/rust-lang/crates.io-index"
 49 | checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
 50 | dependencies = [
 51 |  "unicode-ident",
 52 | ]
 53 | 
 54 | [[package]]
 55 | name = "quote"
 56 | version = "1.0.37"
 57 | source = "registry+https://github.com/rust-lang/crates.io-index"
 58 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
 59 | dependencies = [
 60 |  "proc-macro2",
 61 | ]
 62 | 
 63 | [[package]]
 64 | name = "ryu"
 65 | version = "1.0.18"
 66 | source = "registry+https://github.com/rust-lang/crates.io-index"
 67 | checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 68 | 
 69 | [[package]]
 70 | name = "serde"
 71 | version = "1.0.213"
 72 | source = "registry+https://github.com/rust-lang/crates.io-index"
 73 | checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1"
 74 | dependencies = [
 75 |  "serde_derive",
 76 | ]
 77 | 
 78 | [[package]]
 79 | name = "serde_derive"
 80 | version = "1.0.213"
 81 | source = "registry+https://github.com/rust-lang/crates.io-index"
 82 | checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5"
 83 | dependencies = [
 84 |  "proc-macro2",
 85 |  "quote",
 86 |  "syn",
 87 | ]
 88 | 
 89 | [[package]]
 90 | name = "syn"
 91 | version = "2.0.85"
 92 | source = "registry+https://github.com/rust-lang/crates.io-index"
 93 | checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56"
 94 | dependencies = [
 95 |  "proc-macro2",
 96 |  "quote",
 97 |  "unicode-ident",
 98 | ]
 99 | 
100 | [[package]]
101 | name = "unicode-ident"
102 | version = "1.0.13"
103 | source = "registry+https://github.com/rust-lang/crates.io-index"
104 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
105 | 


--------------------------------------------------------------------------------
/benchmark/rust/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "bench"
3 | version = "0.0.1"
4 | 
5 | [dependencies]
6 | csv = "1.3.0"
7 | 


--------------------------------------------------------------------------------
/benchmark/rust/src/main.rs:
--------------------------------------------------------------------------------
 1 | extern crate csv;
 2 | 
 3 | use std::env;
 4 | use std::error::Error;
 5 | use std::ffi::OsString;
 6 | use std::fs::File;
 7 | 
 8 | // Prints how many records the CSV file named by the first argument yields.
 9 | fn main() {
10 |     let path = get_first_arg().unwrap();
11 |     let handle = File::open(path).unwrap();
12 |     let mut reader = csv::Reader::from_reader(handle);
13 | 
14 |     // Every item produced by `records()` is tallied, exactly like a manual
15 |     // `count += 1` loop over the iterator.
16 |     let total = reader.records().count();
17 | 
18 |     println!("{}", total);
19 | }
20 | 
21 | fn get_first_arg() -> Result<OsString, Box<Error>> {
22 |     match env::args_os().nth(1) {
23 |         None => Err(From::from("expected 1 argument, but got none")),
24 |         Some(file_path) => Ok(file_path),
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/parser.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef ARIA_CSV_H
  2 | #define ARIA_CSV_H
  3 | 
  4 | #include <fstream>
  5 | #include <memory>
  6 | #include <stdexcept>
  7 | #include <string>
  8 | #include <vector>
  9 | 
 10 | namespace aria {
 11 | namespace csv {
 12 | enum class Term { CRLF = -2 }; // CRLF: default terminator sentinel; other values carry a terminator char
 13 | enum class FieldType { DATA, ROW_END, CSV_END }; // kind of result returned by next_field()
 14 | using CSV = std::vector<std::vector<std::string>>; // vector of string vectors; not used within this header
 15 | 
 16 | // Tests a character against the terminator: Term::CRLF accepts either '\r'
 17 | // or '\n', any other Term value is compared as the single char it encodes
 18 | inline auto operator==(const char c, const Term t) -> bool {
 19 |   if (t == Term::CRLF) {
 20 |     return c == '\n' || c == '\r';
 21 |   }
 22 |   return c == static_cast<char>(t);
 23 | }
 24 | 
 25 | // Negation of the char/Term equality
 26 | inline auto operator!=(const char c, const Term t) -> bool { return !operator==(c, t); }
 27 | 
 28 | // One result of the parser: either a data field carrying its text, or a
 29 | // marker built from a FieldType alone, in which case `data` is left empty
 30 | struct Field {
 31 |   explicit Field(FieldType kind) : type(kind) {}
 32 |   explicit Field(std::string &&text)
 33 |       : type(FieldType::DATA), data(std::move(text)) {}
 34 | 
 35 |   FieldType type;
 36 |   std::string data;
 37 | };
 38 | 
 39 | // Streaming CSV parser: reads from an input stream field by field or row by row
 40 | class CsvParser {
 41 | private:
 42 |   // States of the finite state machine that drives next_field()
 43 |   enum class State {
 44 |     START_OF_FIELD,
 45 |     IN_FIELD,
 46 |     IN_QUOTED_FIELD,
 47 |     IN_ESCAPED_QUOTE,
 48 |     END_OF_ROW,
 49 |     EMPTY
 50 |   };
 51 |   State m_state = State::START_OF_FIELD;
 52 | 
 53 |   // Configurable attributes
 54 |   char m_quote = '"';
 55 |   char m_delimiter = ',';
 56 |   Term m_terminator = Term::CRLF;
 57 |   std::istream *m_input;
 58 | 
 59 |   // Buffer capacities
 60 |   static constexpr int FIELDBUF_CAP = 1024;
 61 |   static constexpr int INPUTBUF_CAP = 1024 * 128;
 62 | 
 63 |   // Buffers
 64 |   std::string m_fieldbuf{};
 65 |   std::vector<char> m_inputbuf = std::vector<char>(INPUTBUF_CAP);
 66 | 
 67 |   // Misc
 68 |   bool m_eof = false;                // stream reported EOF on the last read
 69 |   size_t m_cursor = 0;               // index of the next unread byte in m_inputbuf
 70 |   size_t m_bytes_read = 0;           // number of valid bytes in m_inputbuf
 71 |   std::streamoff m_scanposition = 0; // stream offset at which m_inputbuf starts
 72 | 
 73 | public:
 74 |   // Delete copy constructor and assignment
 75 |   CsvParser(const CsvParser &) = delete;
 76 |   auto operator=(const CsvParser &) -> CsvParser & = delete;
 77 | 
 78 |   // Allow move operations
 79 |   CsvParser(CsvParser &&) = default;
 80 |   auto operator=(CsvParser &&) -> CsvParser & = default;
 81 | 
 82 |   // Creates the CSV parser which by default splits on commas, uses quotes
 83 |   // to escape, and treats '\r', '\n', or '\r\n' as the end of a row.
 84 |   // Throws std::runtime_error if the stream is not in a good state.
 85 |   explicit CsvParser(std::istream &input) : m_input(&input) {
 86 |     // Reserve space upfront to improve performance
 87 |     m_fieldbuf.reserve(FIELDBUF_CAP);
 88 |     if (!m_input->good()) {
 89 |       throw std::runtime_error("Something is wrong with input stream");
 90 |     }
 91 |   }
 92 | 
 93 |   // Change the quote character; returns the parser as an rvalue for chaining
 94 |   auto quote(char c) noexcept -> CsvParser && {
 95 |     m_quote = c;
 96 |     return std::move(*this);
 97 |   }
 98 | 
 99 |   // Change the delimiter character; returns the parser as an rvalue for chaining
100 |   auto delimiter(char c) noexcept -> CsvParser && {
101 |     m_delimiter = c;
102 |     return std::move(*this);
103 |   }
104 | 
105 |   // Change the terminator character; returns the parser as an rvalue for chaining
106 |   auto terminator(char c) noexcept -> CsvParser && {
107 |     m_terminator = static_cast<Term>(c);
108 |     return std::move(*this);
109 |   }
110 | 
111 |   // The parser is in the empty state when there are
112 |   // no more tokens left to read from the input buffer
113 |   auto empty() const -> bool { return m_state == State::EMPTY; }
114 | 
115 |   // Stream offset of the next unparsed byte. This is not the stream's own
116 |   // read position, which runs ahead because the input is buffered
117 |   auto position() const -> std::streamoff {
118 |     return m_scanposition + static_cast<std::streamoff>(m_cursor);
119 |   }
120 | 
121 |   // Reads a single field from the CSV
122 |   auto next_field() -> Field {
123 |     if (empty()) {
124 |       return Field(FieldType::CSV_END);
125 |     }
126 |     m_fieldbuf.clear();
127 | 
128 |     // This loop runs until either the parser has
129 |     // read a full field or until there's no tokens left to read
130 |     for (;;) {
131 |       char *maybe_token = top_token();
132 | 
133 |       // Out of tokens: return any pending field text, otherwise CSV_END
134 |       if (maybe_token == nullptr) {
135 |         m_state = State::EMPTY;
136 |         return !m_fieldbuf.empty() ? Field(std::move(m_fieldbuf))
137 |                                    : Field(FieldType::CSV_END);
138 |       }
139 | 
140 |       // Parsing the CSV is done using a finite state machine
141 |       char c = *maybe_token;
142 |       switch (m_state) {
143 |       case State::START_OF_FIELD:
144 |         m_cursor++;
145 |         if (c == m_terminator) {
146 |           handle_crlf(c);
147 |           m_state = State::END_OF_ROW;
148 |           return Field(std::move(m_fieldbuf));
149 |         }
150 | 
151 |         if (c == m_quote) {
152 |           m_state = State::IN_QUOTED_FIELD;
153 |         } else if (c == m_delimiter) {
154 |           return Field(std::move(m_fieldbuf));
155 |         } else {
156 |           m_state = State::IN_FIELD;
157 |           m_fieldbuf += c;
158 |         }
159 | 
160 |         break;
161 | 
162 |       case State::IN_FIELD:
163 |         m_cursor++;
164 |         if (c == m_terminator) {
165 |           handle_crlf(c);
166 |           m_state = State::END_OF_ROW;
167 |           return Field(std::move(m_fieldbuf));
168 |         }
169 | 
170 |         if (c == m_delimiter) {
171 |           m_state = State::START_OF_FIELD;
172 |           return Field(std::move(m_fieldbuf));
173 |         }
174 | 
175 |         m_fieldbuf += c;
176 |         break;
177 | 
178 |       case State::IN_QUOTED_FIELD:
179 |         m_cursor++;
180 |         if (c == m_quote) {
181 |           m_state = State::IN_ESCAPED_QUOTE;
182 |         } else {
183 |           m_fieldbuf += c;
184 |         }
185 | 
186 |         break;
187 | 
188 |       case State::IN_ESCAPED_QUOTE:
189 |         m_cursor++;
190 |         if (c == m_terminator) {
191 |           handle_crlf(c);
192 |           m_state = State::END_OF_ROW;
193 |           return Field(std::move(m_fieldbuf));
194 |         }
195 | 
196 |         if (c == m_quote) {
197 |           m_state = State::IN_QUOTED_FIELD;
198 |           m_fieldbuf += c;
199 |         } else if (c == m_delimiter) {
200 |           m_state = State::START_OF_FIELD;
201 |           return Field(std::move(m_fieldbuf));
202 |         } else {
203 |           m_state = State::IN_FIELD;
204 |           m_fieldbuf += c;
205 |         }
206 | 
207 |         break;
208 | 
209 |       case State::END_OF_ROW:
210 |         m_state = State::START_OF_FIELD;
211 |         return Field(FieldType::ROW_END);
212 | 
213 |       case State::EMPTY:
214 |         throw std::logic_error("You goofed");
215 |       }
216 |     }
217 |   }
218 | 
219 | private:
220 |   // At a '\r' row ending under the default terminator, also consume a
221 |   // directly following '\n' so that "\r\n" counts as one terminator.
222 |   // Any other terminator setting or character is left untouched.
223 |   void handle_crlf(const char c) {
224 |     if (m_terminator != Term::CRLF || c != '\r') {
225 |       return;
226 |     }
227 | 
228 |     char *token = top_token();
229 |     if ((token != nullptr) && *token == '\n') {
230 |       m_cursor++;
231 |     }
232 |   }
233 | 
234 |   // Peeks at the next token without advancing the cursor, refilling the
235 |   // input buffer when it is exhausted. Returns nullptr once nothing is left.
236 |   auto top_token() -> char * {
237 |     // Return null if there's nothing left to read
238 |     if (m_eof && m_cursor == m_bytes_read) {
239 |       return nullptr;
240 |     }
241 | 
242 |     // Refill the input buffer if it's been fully read
243 |     if (m_cursor == m_bytes_read) {
244 |       fill_buffer();
245 |       // Return null if there's nothing left to read
246 |       if (m_bytes_read == 0) {
247 |         return nullptr;
248 |       }
249 |     }
250 | 
251 |     return &m_inputbuf[m_cursor];
252 |   }
253 | 
254 |   // Refills the input buffer from the stream and rewinds the cursor
255 |   void fill_buffer() {
256 |     // m_scanposition is the stream offset of the buffer's first byte, so
257 |     // account for the buffer being discarded before overwriting it
258 |     m_scanposition += static_cast<std::streamoff>(m_bytes_read);
259 |     m_input->read(m_inputbuf.data(), INPUTBUF_CAP);
260 |     m_bytes_read = static_cast<size_t>(m_input->gcount());
261 |     m_eof = m_input->eof();
262 |     m_cursor = 0;
263 | 
264 |     // Skip a UTF-8 byte order mark at the very start of the stream
265 |     if (m_scanposition == 0 && m_bytes_read >= 3 && m_inputbuf[0] == '\xEF' &&
266 |         m_inputbuf[1] == '\xBB' && m_inputbuf[2] == '\xBF') {
267 |       if (m_bytes_read > 3) {
268 |         m_cursor = 3;
269 |       } else {
270 |         m_bytes_read = 0;
271 |       }
272 |     }
273 |   }
274 | 
275 | public:
276 |   // Input iterator yielding one row (a vector of strings) per increment; it
277 |   // equals end() once m_current_row is -1 and the row holds no fields
278 |   class iterator {
279 |   public:
280 |     using difference_type = std::ptrdiff_t;
281 |     using value_type = std::vector<std::string>;
282 |     using pointer = const std::vector<std::string> *;
283 |     using reference = const std::vector<std::string> &;
284 |     using iterator_category = std::input_iterator_tag;
285 | 
286 |     explicit iterator(CsvParser *p, bool end = false) : m_parser(p) {
287 |       static constexpr size_t DEFAULT_ROW_CAPACITY = 50;
288 |       if (!end) {
289 |         m_row.reserve(DEFAULT_ROW_CAPACITY);
290 |         m_current_row = 0;
291 |         next();
292 |       }
293 |     }
294 | 
295 |     auto operator++() -> iterator & {
296 |       next();
297 |       return *this;
298 |     }
299 | 
300 |     auto operator++(int) -> iterator {
301 |       iterator i = (*this);
302 |       ++(*this);
303 |       return i;
304 |     }
305 | 
306 |     auto operator==(const iterator &other) const -> bool {
307 |       return m_current_row == other.m_current_row &&
308 |              m_row.size() == other.m_row.size();
309 |     }
310 | 
311 |     auto operator!=(const iterator &other) const -> bool {
312 |       return !(*this == other);
313 |     }
314 | 
315 |     auto operator*() const -> reference { return m_row; }
316 | 
317 |     auto operator->() const -> pointer { return &m_row; }
318 | 
319 |   private:
320 |     value_type m_row{};
321 |     CsvParser *m_parser;
322 |     int m_current_row = -1;
323 | 
324 |     void next() {
325 |       value_type::size_type num_fields = 0;
326 |       for (;;) {
327 |         auto field = m_parser->next_field();
328 |         switch (field.type) {
329 |         case FieldType::CSV_END:
330 |           if (num_fields < m_row.size()) {
331 |             m_row.resize(num_fields);
332 |           }
333 |           m_current_row = -1;
334 |           return;
335 |         case FieldType::ROW_END:
336 |           if (num_fields < m_row.size()) {
337 |             m_row.resize(num_fields);
338 |           }
339 |           m_current_row++;
340 |           return;
341 |         case FieldType::DATA:
342 |           if (num_fields < m_row.size()) {
343 |             m_row[num_fields] = std::move(field.data);
344 |           } else {
345 |             m_row.push_back(std::move(field.data));
346 |           }
347 |           num_fields++;
348 |         }
349 |       }
350 |     }
351 |   };
352 | 
353 |   auto begin() -> iterator { return iterator(this); }
354 |   auto end() -> iterator { return iterator(this, true); }
355 | };
356 | } // namespace csv
357 | } // namespace aria
358 | #endif
359 | 


--------------------------------------------------------------------------------
/test/.clangd:
--------------------------------------------------------------------------------
1 | CompileFlags:
2 |   CompilationDatabase: out
3 | 
4 | Diagnostics:
5 |   ClangTidy:
6 |     Remove:
7 |       - modernize-use-trailing-return-type
8 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.14)
 2 | project(CSVParser)
 3 | 
 4 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 5 | 
 6 | # Fetch GoogleTest. Policy CMP0135 (timestamps of extracted archives) only
 7 | # exists from CMake 3.24, so setting it must be guarded for older versions.
 8 | if(POLICY CMP0135)
 9 |   cmake_policy(SET CMP0135 NEW)
10 | endif()
11 | include(FetchContent)
12 | FetchContent_Declare(
13 |   googletest
14 |   URL https://github.com/google/googletest/archive/refs/tags/v1.12.0.zip
15 | )
16 | FetchContent_MakeAvailable(googletest)
17 | 
18 | # Test executable; TEST_DATA_DIR is the absolute path of the data directory.
19 | add_executable(parser_test parser_test.cpp)
20 | target_compile_definitions(parser_test
21 |   PRIVATE TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/data"
22 | )
23 | target_link_libraries(parser_test PRIVATE gtest_main)
24 | 
25 | # Register the test cases with CTest so `ctest` works from the build directory.
26 | enable_testing()
27 | include(GoogleTest)
28 | gtest_discover_tests(parser_test)
29 | 


--------------------------------------------------------------------------------
/test/data/bom_empty.csv:
--------------------------------------------------------------------------------
1 | ﻿


--------------------------------------------------------------------------------
/test/data/bom_simple.csv:
--------------------------------------------------------------------------------
1 | ﻿a,b,c
2 | 1,2,3
3 | 


--------------------------------------------------------------------------------
/test/data/comma_in_quotes.csv:
--------------------------------------------------------------------------------
1 | first,last,address,city,zip
2 | John,Doe,120 any st.,"Anytown, WW",08123


--------------------------------------------------------------------------------
/test/data/delimiter.csv:
--------------------------------------------------------------------------------
1 | a;b;c
2 | 1;2;3
3 | 4;5;,
4 | 


--------------------------------------------------------------------------------
/test/data/empty.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,"",""
3 | 2,3,4


--------------------------------------------------------------------------------
/test/data/emptyUnquoted.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,,
3 | 2,3,4


--------------------------------------------------------------------------------
/test/data/empty_crlf.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,"",""
3 | 2,3,4


--------------------------------------------------------------------------------
/test/data/empty_file.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AriaFallah/csv-parser/43961a918f150088dd0c288b0c9c551e0be795f8/test/data/empty_file.csv


--------------------------------------------------------------------------------
/test/data/escaped_quotes.csv:
--------------------------------------------------------------------------------
1 | a,b
2 | 1,"ha ""ha"" ha"
3 | 3,4
4 | 


--------------------------------------------------------------------------------
/test/data/json.csv:
--------------------------------------------------------------------------------
1 | key,val
2 | 1,"{""type"": ""Point"", ""coordinates"": [102.0, 0.5]}"
3 | 


--------------------------------------------------------------------------------
/test/data/newlines.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,2,3
3 | "Once upon 
4 | a time",5,6
5 | 7,8,9
6 | 


--------------------------------------------------------------------------------
/test/data/newlines_crlf.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,2,3
3 | "Once upon 
4 | a time",5,6
5 | 7,8,9
6 | 


--------------------------------------------------------------------------------
/test/data/quote.csv:
--------------------------------------------------------------------------------
1 | '1, 2, 3','4, 5, 6','
2 | 7
3 | 8
4 | 9'
5 | 


--------------------------------------------------------------------------------
/test/data/quotes_and_newlines.csv:
--------------------------------------------------------------------------------
1 | a,b
2 | 1,"ha 
3 | ""ha"" 
4 | ha"
5 | 3,4
6 | 


--------------------------------------------------------------------------------
/test/data/simple.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,2,3
3 | 


--------------------------------------------------------------------------------
/test/data/simple_crlf.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,2,3
3 | 


--------------------------------------------------------------------------------
/test/data/terminator.csv:
--------------------------------------------------------------------------------
1 | a,b,c;1,2,3;4,5,6
2 | 


--------------------------------------------------------------------------------
/test/data/utf8.csv:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,2,3
3 | 4,5,ʤ


--------------------------------------------------------------------------------
/test/parser_test.cpp:
--------------------------------------------------------------------------------
  1 | #include "../parser.hpp"
  2 | #include <fstream>
  3 | #include <gtest/gtest.h>
  4 | 
  5 | using namespace aria::csv;
  6 | 
  7 | auto read_all(CsvParser &p) -> CSV {
  8 |   CSV csv;
  9 |   for (const auto &row : p) {
 10 |     csv.push_back(row);
 11 |   }
 12 |   return csv;
 13 | }
 14 | 
 15 | TEST(CsvParserTest, CommaInQuotes) {
 16 |   std::ifstream f(TEST_DATA_DIR "/comma_in_quotes.csv");
 17 |   CsvParser parser(f);
 18 |   CSV expected = {{"first", "last", "address", "city", "zip"},
 19 |                   {"John", "Doe", "120 any st.", "Anytown, WW", "08123"}};
 20 |   EXPECT_EQ(read_all(parser), expected);
 21 | }
 22 | 
 23 | TEST(CsvParserTest, Empty) {
 24 |   std::ifstream f(TEST_DATA_DIR "/empty.csv");
 25 |   CsvParser parser(f);
 26 |   CSV expected = {{"a", "b", "c"}, {"1", "", ""}, {"2", "3", "4"}};
 27 |   EXPECT_EQ(read_all(parser), expected);
 28 | }
 29 | 
 30 | TEST(CsvParserTest, EmptyUnquoted) {
 31 |   std::ifstream f(TEST_DATA_DIR "/emptyUnquoted.csv");
 32 |   CsvParser parser(f);
 33 |   CSV expected = {{"a", "b", "c"}, {"1", "", ""}, {"2", "3", "4"}};
 34 |   EXPECT_EQ(read_all(parser), expected);
 35 | }
 36 | 
 37 | TEST(CsvParserTest, EmptyCrlf) {
 38 |   std::ifstream f(TEST_DATA_DIR "/empty_crlf.csv");
 39 |   CsvParser parser(f);
 40 |   CSV expected = {{"a", "b", "c"}, {"1", "", ""}, {"2", "3", "4"}};
 41 |   EXPECT_EQ(read_all(parser), expected);
 42 | }
 43 | 
 44 | TEST(CsvParserTest, EscapedQuotes) {
 45 |   std::ifstream f(TEST_DATA_DIR "/escaped_quotes.csv");
 46 |   CsvParser parser(f);
 47 |   CSV expected = {{"a", "b"}, {"1", R"(ha "ha" ha)"}, {"3", "4"}};
 48 |   EXPECT_EQ(read_all(parser), expected);
 49 | }
 50 | 
 51 | TEST(CsvParserTest, Json) {
 52 |   std::ifstream f(TEST_DATA_DIR "/json.csv");
 53 |   CsvParser parser(f);
 54 |   CSV expected = {{"key", "val"},
 55 |                   {"1", R"({"type": "Point", "coordinates": [102.0, 0.5]})"}};
 56 |   EXPECT_EQ(read_all(parser), expected);
 57 | }
 58 | 
 59 | TEST(CsvParserTest, Newlines) {
 60 |   std::ifstream f(TEST_DATA_DIR "/newlines.csv");
 61 |   CsvParser parser(f);
 62 |   CSV expected = {{"a", "b", "c"},
 63 |                   {"1", "2", "3"},
 64 |                   {"Once upon \na time", "5", "6"},
 65 |                   {"7", "8", "9"}};
 66 |   EXPECT_EQ(read_all(parser), expected);
 67 | }
 68 | 
 69 | TEST(CsvParserTest, NewlinesCrlf) {
 70 |   std::ifstream f(TEST_DATA_DIR "/newlines_crlf.csv");
 71 |   CsvParser parser(f);
 72 |   CSV expected = {{"a", "b", "c"},
 73 |                   {"1", "2", "3"},
 74 |                   {"Once upon \r\na time", "5", "6"},
 75 |                   {"7", "8", "9"}};
 76 |   EXPECT_EQ(read_all(parser), expected);
 77 | }
 78 | 
 79 | TEST(CsvParserTest, QuotesAndNewlines) {
 80 |   std::ifstream f(TEST_DATA_DIR "/quotes_and_newlines.csv");
 81 |   CsvParser parser(f);
 82 |   CSV expected = {{"a", "b"}, {"1", "ha \n\"ha\" \nha"}, {"3", "4"}};
 83 |   EXPECT_EQ(read_all(parser), expected);
 84 | }
 85 | 
 86 | TEST(CsvParserTest, Simple) {
 87 |   std::ifstream f(TEST_DATA_DIR "/simple.csv");
 88 |   CsvParser parser(f);
 89 |   CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}};
 90 |   EXPECT_EQ(read_all(parser), expected);
 91 | }
 92 | 
 93 | TEST(CsvParserTest, SimpleCrlf) {
 94 |   std::ifstream f(TEST_DATA_DIR "/simple_crlf.csv");
 95 |   CsvParser parser(f);
 96 |   CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}};
 97 |   EXPECT_EQ(read_all(parser), expected);
 98 | }
 99 | 
100 | TEST(CsvParserTest, Utf8) {
101 |   std::ifstream f(TEST_DATA_DIR "/utf8.csv");
102 |   CsvParser parser(f);
103 |   CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}, {"4", "5", "ʤ"}};
104 |   EXPECT_EQ(read_all(parser), expected);
105 | }
106 | 
107 | TEST(CsvParserTest, DifferentDelimiter) {
108 |   std::ifstream f(TEST_DATA_DIR "/delimiter.csv");
109 |   CsvParser parser = CsvParser(f).delimiter(';');
110 |   CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}, {"4", "5", ","}};
111 |   EXPECT_EQ(read_all(parser), expected);
112 | }
113 | 
114 | TEST(CsvParserTest, DifferentTerminator) {
115 |   std::ifstream f(TEST_DATA_DIR "/terminator.csv");
116 |   CsvParser parser = CsvParser(f).terminator(';');
117 |   CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}, {"4", "5", "6\n"}};
118 |   EXPECT_EQ(read_all(parser), expected);
119 | }
120 | 
121 | TEST(CsvParserTest, DifferentQuote) {
122 |   std::ifstream f(TEST_DATA_DIR "/quote.csv");
123 |   CsvParser parser = CsvParser(f).quote('\'');
124 |   CSV expected = {{"1, 2, 3", "4, 5, 6", "\n7\n8\n9"}};
125 |   EXPECT_EQ(read_all(parser), expected);
126 | }
127 | 
128 | TEST(CsvParserTest, BomSimple) {
129 |   std::ifstream f(TEST_DATA_DIR "/bom_simple.csv");
130 |   CsvParser parser(f);
131 |   CSV expected = {{"a", "b", "c"}, {"1", "2", "3"}};
132 |   EXPECT_EQ(read_all(parser), expected);
133 | }
134 | 
135 | TEST(CsvParserTest, BomEmpty) {
136 |   std::ifstream f(TEST_DATA_DIR "/bom_empty.csv");
137 |   CsvParser parser(f);
138 |   CSV expected = {};
139 |   EXPECT_EQ(read_all(parser), expected);
140 | }
141 | 
142 | TEST(CsvParserTest, EmptyFile) {
143 |   std::ifstream f(TEST_DATA_DIR "/empty_file.csv");
144 |   CsvParser parser(f);
145 |   CSV expected = {};
146 |   EXPECT_EQ(read_all(parser), expected);
147 | }
148 | 


--------------------------------------------------------------------------------