├── single_include_test
    ├── my_header.hpp
    ├── README.md
    ├── CMakeLists.txt
    ├── file1.cpp
    └── file2.cpp
├── python
    ├── csvpy
    │   ├── __init__.py
    │   └── DictReader.py
    ├── examples
    │   ├── PyDemo2.py
    │   ├── PyDemo7.py
    │   ├── PyDemo6.py
    │   ├── PyDemo5.py
    │   ├── PyDemo1.py
    │   ├── PyDemo3.py
    │   └── PyDemo4.py
    ├── CMakeLists.txt
    └── csvpy.cpp
├── codecov.yml
├── tests
    ├── main.cpp
    ├── test_csv_delimeter.cpp
    ├── test_guess_csv.cpp
    ├── CMakeLists.txt
    ├── shared
    │   └── float_test_cases.hpp
    ├── test_csv_ranges.cpp
    ├── test_round_trip.cpp
    ├── test_csv_format.cpp
    ├── test_csv_row.cpp
    ├── test_csv_field_array.cpp
    ├── test_csv_stat.cpp
    ├── test_csv_row_json.cpp
    ├── test_csv_iterator.cpp
    ├── test_write_csv.cpp
    ├── test_data_type.cpp
    ├── test_read_csv_file.cpp
    ├── test_csv_field.cpp
    └── test_raw_csv_data.cpp
├── .gitmodules
├── cpp.hint
├── .gitattributes
├── docs
    └── source
    │   ├── variable_row_lengths.md
    │   ├── scientific_notation.md
    │   └── Doxy.md
├── programs
    ├── csv_bench.py
    ├── csv_info.cpp
    ├── round_trip.cpp
    ├── csv_stats.cpp
    ├── csv_generator.cpp
    ├── csv_guess_bench.cpp
    ├── csv_bench.cpp
    ├── CMakeLists.txt
    └── data_type_bench.cpp
├── include
    ├── internal
    │   ├── CMakeLists.txt
    │   ├── col_names.cpp
    │   ├── col_names.hpp
    │   ├── csv_utility.hpp
    │   ├── csv_stat.hpp
    │   ├── csv_reader_iterator.cpp
    │   ├── csv_utility.cpp
    │   ├── csv_format.cpp
    │   ├── csv_format.hpp
    │   ├── common.hpp
    │   ├── csv_row_json.cpp
    │   ├── csv_reader.hpp
    │   ├── csv_row.cpp
    │   ├── csv_stat.cpp
    │   ├── basic_csv_parser.cpp
    │   └── csv_reader.cpp
    └── csv.hpp
├── CMakeSettings.json
├── LICENSE
├── .gitignore
├── Makefile
├── .travis.yml
├── .github
    └── workflows
    │   └── cmake-multi-platform.yml
├── CMakeLists.txt
└── single_header.py


/single_include_test/my_header.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "csv.hpp"


--------------------------------------------------------------------------------
/python/csvpy/__init__.py:
--------------------------------------------------------------------------------
1 | from .csvpy import Reader
2 | from .DictReader import DictReader


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
# Paths excluded from coverage reporting
ignore:
    - "include/external"
    - "tests"
# Project-wide coverage target.
# Indentation uses spaces only: YAML does not allow tab characters for indentation.
coverage:
    status:
        project:
            default:
                target: 95%


--------------------------------------------------------------------------------
/tests/main.cpp:
--------------------------------------------------------------------------------
1 | #define CATCH_CONFIG_MAIN
2 | 
3 | // For Catch + MSVC
4 | #define _SILENCE_CXX17_UNCAUGHT_EXCEPTION_DEPRECATION_WARNING
5 | 
6 | #include <catch2/catch_all.hpp>


--------------------------------------------------------------------------------
/single_include_test/README.md:
--------------------------------------------------------------------------------
1 | # Purpose
2 | 
3 | The purpose of this directory is to make sure that the single header
4 | `csv.hpp` file does not cause compile errors when `#include`d from multiple
5 | .cpp files.


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tests/data"]
2 | 	path = tests/data
3 | 	url = https://github.com/vincentlaucsb/csv-data.git
4 | [submodule "python/pybind11"]
5 | 	path = python/pybind11
6 | 	url = https://github.com/pybind/pybind11.git
7 | 


--------------------------------------------------------------------------------
/cpp.hint:
--------------------------------------------------------------------------------
1 | // Hint files help the Visual Studio IDE interpret Visual C++ identifiers
2 | // such as names of functions and macros.
3 | // For more information see https://go.microsoft.com/fwlink/?linkid=865984
4 | #define CONSTEXPR
5 | 


--------------------------------------------------------------------------------
/python/examples/PyDemo2.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import csvpy
 3 | 
 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv'
 5 | reader = csvpy.Reader(str(path))
 6 | 
 7 | for row in reader:
 8 |     row['Year'].get_int()
 9 |     # row[0].get_int()
10 | 


--------------------------------------------------------------------------------
/python/examples/PyDemo7.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import csvpy
 3 | 
 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv'
 5 | 
 6 | info = csvpy.get_file_info(str(path))
 7 | 
 8 | print(info.filename)
 9 | print(info.col_names)
10 | print(info.delim)
11 | print(info.n_rows)
12 | print(info.n_cols)


--------------------------------------------------------------------------------
/single_include_test/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # The purpose of this executable is to make sure it successfully compiles
2 | add_executable(single_include_test "")
3 | target_sources(single_include_test
4 |     PRIVATE
5 |         file1.cpp
6 |         file2.cpp
7 | )
8 | target_link_libraries(single_include_test PRIVATE Threads::Threads)
9 | 


--------------------------------------------------------------------------------
/python/examples/PyDemo6.py:
--------------------------------------------------------------------------------
 1 | import csvpy
 2 | 
 3 | format = csvpy.Format().delimiter(',')
 4 | reader = csvpy.parse(
 5 |     'Name, Age\nHussein Sarea, 22\nMoataz Sarea, 21',
 6 |     format
 7 | )
 8 | # reader = csvpy.parse_no_header(
 9 | #     'Name, Age\nHussein Sarea, 22\nMoataz Sarea, 21',
10 | # )
11 | for r in reader:
12 |     print(r[1].get_str())
13 | 


--------------------------------------------------------------------------------
/python/examples/PyDemo5.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import csvpy
 3 | 
 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv'
 5 | format = csvpy.Format()
 6 | format.delimiter(',').quote('"').header_row(2)
 7 | reader = csvpy.Reader(str(path), format)
 8 | for row in reader:
 9 |     # Do stuff with rows here
10 |     print(row[1].get_str())
11 | 


--------------------------------------------------------------------------------
/python/examples/PyDemo1.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import csvpy
 3 | 
 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv'
 5 | reader = csvpy.Reader(str(path))
 6 | 
 7 | for row in reader:
 8 |     for field in row:
 9 |         # field.get_int()
10 |         # field.get_float()
11 |         # field.get_double()
12 |         # field.get_sv()
13 |         print(field.get_str())
14 | 
15 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/docs/source/variable_row_lengths.md:
--------------------------------------------------------------------------------
 1 | # Dealing with Variable Length CSV Rows
 2 | 
 3 | `csv::CSVReader` generally assumes that most rows in a CSV are of the same length.
 4 | If your CSV has important data stored in rows which may not be the same length
 5 | as the others, then you may want to create your own subclass of CSVReader and
 6 | override `bad_row_handler`.
 7 | 
 8 | ## Examples
 9 |  * csv::CSVReader::bad_row_handler
10 |  * csv::internals::CSVGuesser::Guesser::bad_row_handler()


--------------------------------------------------------------------------------
/python/examples/PyDemo3.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import csvpy
 3 | 
 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv'
 5 | reader = csvpy.Reader(str(path))
 6 | 
 7 | for row in reader:
 8 |     if row['Year'].is_int():
 9 |         row['Year'].get_int()
10 |     elif row['Year'].is_float():
11 |         row['Year'].get_float()
12 |     elif row['Year'].is_str():
13 |         row['Year'].get_str()
14 |     elif row['Year'].is_null():
15 |         pass
16 | 


--------------------------------------------------------------------------------
/python/examples/PyDemo4.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import csvpy
 3 | 
 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv'
 5 | reader = csvpy.Reader(str(path))
 6 | 
 7 | # for row in reader:
 8 | #     print(row.to_json())
 9 | #     print(row.to_json_array())
10 | 
11 | for row in reader:
12 |     # You can pass in a list of column names to slice or rearrange the outputted JSON
13 |     print(row.to_json(['Entity Type', 'Year']))
14 |     print(row.to_json_array(['Year', 'Entity Type']))


--------------------------------------------------------------------------------
/programs/csv_bench.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import csv
 3 | 
 4 | parser = argparse.ArgumentParser(description='Count the number of lines in a CSV')
 5 | parser.add_argument('file', type=str, nargs=1,
 6 |                     help='File to parse')
 7 | parser.add_argument('encoding', nargs='?', type=str, default='utf-8',
 8 |                     help='File encoding')
 9 | 
10 | args = parser.parse_args()
11 | file = args.file[0]
12 | enc = args.encoding
13 | 
14 | j = 0
15 | with open(file, 'r', encoding=enc) as csv_file:
16 |     reader = csv.reader(csv_file)
17 |     for i in reader:
18 |         j += 1
19 |         
20 | print(j)


--------------------------------------------------------------------------------
/programs/csv_info.cpp:
--------------------------------------------------------------------------------
 1 | #include "csv.hpp"
 2 | #include <iostream>
 3 | 
 4 | int main(int argc, char** argv) {
 5 |     using namespace csv;
 6 | 
 7 |     if (argc < 2) {
 8 |         std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
 9 |         exit(1);
10 |     }
11 | 
12 |     std::string file = argv[1];
13 |     auto info = get_file_info(file);
14 | 
15 |     std::cout << file << std::endl
16 |         << "Columns: " << internals::format_row(info.col_names, ", ")
17 |         << "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns" << std::endl
18 |         << "Delimiter: " << info.delim << std::endl;
19 | 
20 |     return 0;
21 | }


--------------------------------------------------------------------------------
/single_include_test/file1.cpp:
--------------------------------------------------------------------------------
 1 | #include "my_header.hpp"
 2 | #include <iostream>
 3 | 
 4 | int foobar(int argc, char** argv) {
 5 |     using namespace csv;
 6 | 
 7 |     if (argc < 2) {
 8 |         std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
 9 |         exit(1);
10 |     }
11 | 
12 |     std::string file = argv[1];
13 |     auto info = get_file_info(file);
14 | 
15 |     std::cout << file << std::endl
16 |         << "Columns: " << internals::format_row(info.col_names, ", ")
17 |         << "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns" << std::endl
18 |         << "Delimiter: " << info.delim << std::endl;
19 | 
20 |     return 0;
21 | }


--------------------------------------------------------------------------------
/include/internal/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_library(csv STATIC "")
 2 | 
 3 | target_sources(csv
 4 | 	PRIVATE
 5 | 		basic_csv_parser.hpp
 6 | 		basic_csv_parser.cpp
 7 | 		col_names.cpp
 8 | 		col_names.hpp
 9 | 		common.hpp
10 | 		csv_format.hpp
11 | 		csv_format.cpp
12 | 		csv_reader.hpp
13 | 		csv_reader.cpp
14 | 		csv_reader_iterator.cpp
15 | 		csv_row.hpp
16 | 		csv_row.cpp
17 | 		csv_row_json.cpp
18 | 		csv_stat.cpp
19 | 		csv_stat.hpp
20 | 		csv_utility.cpp
21 | 		csv_utility.hpp
22 | 		csv_writer.hpp
23 | 		"data_type.hpp"
24 | 		)
25 | 
26 | set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX)
27 | target_link_libraries(csv PRIVATE Threads::Threads)
28 | target_include_directories(csv INTERFACE ../)
29 | 


--------------------------------------------------------------------------------
/programs/round_trip.cpp:
--------------------------------------------------------------------------------
 1 | #include "csv.hpp"
 2 | #include <iostream>
 3 | 
 4 | int main(int argc, char** argv) {
 5 |     using namespace csv;
 6 | 
 7 |     if (argc < 3) {
 8 |         std::cout << "Usage: " << argv[0] << " [file] [out]" << std::endl;
 9 |         exit(1);
10 |     }
11 | 
12 |     std::string file = argv[1];
13 |     std::string out = argv[2];
14 | 
15 |     std::ofstream outfile(out);
16 |     auto writer = make_csv_writer(outfile);    
17 | 
18 |     CSVFormat format;
19 |     format.variable_columns(true);
20 |     CSVReader reader(file, format);
21 |     writer << reader.get_col_names();
22 | 
23 |     for (auto& row: reader) {
24 |         writer << std::vector<std::string>(row);
25 |     }
26 | 
27 |     return 0;
28 | }


--------------------------------------------------------------------------------
/docs/source/scientific_notation.md:
--------------------------------------------------------------------------------
 1 | # Scientific Notation Parsing
 2 | 
 3 | This library has support for parsing scientific notation through `csv::internals::data_type()`,
 4 | which is in turn called by `csv::CSVField::get()` when used with a floating point value type
 5 | as the template parameter. Malformed scientific notation will be interpreted by this 
 6 | library as a regular string.
 7 | 
 8 | ## Examples
 9 | \snippet tests/test_data_type.cpp Parse Scientific Notation
10 | 
11 | ## Supported Flavors
12 | 
13 | Many different variations of E-notation are supported, as long as there isn't a whitespace
14 | between E and the successive exponent. As seen below, the `+` sign is optional, and any number of 
15 | zeroes is accepted.
16 | 
17 | \snippet tests/test_data_type.cpp Scientific Notation Flavors


--------------------------------------------------------------------------------
/tests/test_csv_delimeter.cpp:
--------------------------------------------------------------------------------
 1 | #include "csv.hpp"
 2 | #include <catch2/catch_all.hpp>
 3 | #include <cmath>
 4 | #include <iostream>
 5 | 
 6 | TEST_CASE("Test delim from file", "[test_csv_reader_get_format_get_delim_from_file]") {
 7 |     csv::CSVReader reader("./tests/data/fake_data/delimeter.csv");
 8 |     char delim = reader.get_format().get_delim();
 9 |     REQUIRE(delim == ';');
10 | }
11 | 
12 | TEST_CASE("Test delim from string", "[test_csv_reader_get_format_get_delim_from_string]") {
13 |     std::ifstream file_stream("./tests/data/fake_data/delimeter.csv");
14 |     std::string csv_data((std::istreambuf_iterator<char>(file_stream)), std::istreambuf_iterator<char>());
15 |     std::stringstream ss(csv_data);
16 | 
17 |     csv::CSVReader reader(ss);
18 |     char delim = reader.get_format().get_delim();
19 |     REQUIRE(delim == ';');
20 | }
21 | 


--------------------------------------------------------------------------------
/programs/csv_stats.cpp:
--------------------------------------------------------------------------------
 1 | #include "csv.hpp"
 2 | 
 3 | int main(int argc, char** argv) {
 4 |     using namespace csv;
 5 | 
 6 |     if (argc < 2) {
 7 |         std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
 8 |         exit(1);
 9 |     }
10 | 
11 |     std::string filename = argv[1];
12 |     CSVStat stats(filename);
13 | 
14 |     auto col_names = stats.get_col_names();
15 |     auto min = stats.get_mins(), max = stats.get_maxes(),
16 |         means = stats.get_mean(), vars = stats.get_variance();
17 | 
18 |     for (size_t i = 0; i < col_names.size(); i++) {
19 |         std::cout << col_names[i] << std::endl
20 |             << "Min: " << min[i] << std::endl
21 |             << "Max: " << max[i] << std::endl
22 |             << "Mean: " << means[i] << std::endl
23 |             << "Var: " << vars[i] << std::endl;
24 |     }
25 | 
26 |     return 0;
27 | }


--------------------------------------------------------------------------------
/single_include_test/file2.cpp:
--------------------------------------------------------------------------------
 1 | #include "my_header.hpp"
 2 | 
 3 | int main(int argc, char** argv) {
 4 |     using namespace csv;
 5 | 
 6 |     if (argc < 2) {
 7 |         std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
 8 |         exit(1);
 9 |     }
10 | 
11 |     std::string filename = argv[1];
12 |     CSVStat stats(filename);
13 | 
14 |     auto col_names = stats.get_col_names();
15 |     auto min = stats.get_mins(), max = stats.get_maxes(),
16 |         means = stats.get_mean(), vars = stats.get_variance();
17 | 
18 |     for (size_t i = 0; i < col_names.size(); i++) {
19 |         std::cout << col_names[i] << std::endl
20 |             << "Min: " << min[i] << std::endl
21 |             << "Max: " << max[i] << std::endl
22 |             << "Mean: " << means[i] << std::endl
23 |             << "Var: " << vars[i] << std::endl;
24 |     }
25 | 
26 |     return 0;
27 | }


--------------------------------------------------------------------------------
/CMakeSettings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "configurations": [
 3 |     {
 4 |       "name": "x64-Release",
 5 |       "generator": "Ninja",
 6 |       "configurationType": "RelWithDebInfo",
 7 |       "inheritEnvironments": [
 8 |         "msvc_x64_x64"
 9 |       ],
10 |       "buildRoot": "${projectDir}\\build\\${name}",
11 |       "installRoot": "${projectDir}\\install\\${name}",
12 |       "cmakeCommandArgs": "",
13 |       "buildCommandArgs": "-v",
14 |       "ctestCommandArgs": ""
15 |     },
16 |     {
17 |       "name": "x64-Debug",
18 |       "generator": "Ninja",
19 |       "configurationType": "Debug",
20 |       "inheritEnvironments": [
21 |         "msvc_x64_x64"
22 |       ],
23 |       "buildRoot": "${projectDir}\\build\\${name}",
24 |       "installRoot": "${projectDir}\\install\\${name}",
25 |       "cmakeCommandArgs": "",
26 |       "buildCommandArgs": "-v",
27 |       "ctestCommandArgs": ""
28 |     }
29 |   ]
30 | }


--------------------------------------------------------------------------------
/python/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | if (EXISTS ${CMAKE_CURRENT_LIST_DIR}/pybind11)
 2 |     add_subdirectory(pybind11)
 3 | 
 4 |     if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
 5 |         set(PYBIND11_CPP_STANDARD /std:c++17)
 6 |     else()
 7 |         set(PYBIND11_CPP_STANDARD -std=c++1z)
 8 |     endif()
 9 | 
10 |     set(CSVPY_SOURCES 
11 |         ${CMAKE_CURRENT_LIST_DIR}/csvpy.cpp
12 |     )
13 | 
14 |     pybind11_add_module(csvpy ${CSVPY_SOURCES})
15 |     target_link_libraries(csvpy PUBLIC csv)
16 | 
17 |     get_property(csvpySuffix TARGET csvpy PROPERTY SUFFIX)
18 |     add_custom_command(TARGET csvpy POST_BUILD
19 |                        COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:csvpy>
20 |                        ${CMAKE_CURRENT_LIST_DIR}/csvpy/csvpy${csvpySuffix})
21 | else()
22 |     message("Cannot build Python bindings because pybind11 submodule was not found. Please run ""git submodule update --recursive"".")
23 | endif()


--------------------------------------------------------------------------------
/programs/csv_generator.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <array>
 3 | #include <charconv>
 4 | #include <random>
 5 | 
 6 | #include "csv.hpp"
 7 | 
 8 | int main(int argc, char** argv) {
 9 |     using namespace csv;
10 |     std::uniform_real_distribution<double> d(1, 1000000);
11 |     std::mt19937 gen;
12 |     gen.seed(time(0));
13 | 
14 |     if (argc < 2) {
15 |         std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
16 |         exit(1);
17 |     }
18 | 
19 |     std::string file = argv[1];
20 |     std::ofstream outfile(file);
21 | 
22 |     CSVWriter<std::ofstream> writer(outfile);
23 |     writer << std::vector<std::string>({"A", "B", "C", "D", "E"});
24 | 
25 |     for (size_t i = 0; i < 1000000; i++) {
26 |         writer << std::array<double, 5>({
27 |             d(gen),
28 |             d(gen),
29 |             d(gen),
30 |             d(gen),
31 |             d(gen)
32 |         });
33 |     }
34 | 
35 |     return 0;
36 | }


--------------------------------------------------------------------------------
/python/csvpy/DictReader.py:
--------------------------------------------------------------------------------
 1 | from .csvpy import Reader, DataType
 2 | 
 3 | class DictReader:
 4 |     def __init__(self, filename):
 5 |         self._reader = Reader(filename)
 6 |         self._csvIterator = self._reader.__iter__()
 7 | 
 8 |     def __iter__(self):
 9 |         return self
10 | 
11 |     def __next__(self):
12 |         ret = dict()
13 |         next_row = self._csvIterator.__next__()
14 | 
15 |         for col_name in next_row.get_col_names():
16 |             field = next_row[col_name]
17 |             field_type = field.type()
18 |             value = None
19 | 
20 |             if field_type == DataType.CSV_STRING:
21 |                 value = field.get_str()
22 |             elif field_type >= DataType.CSV_INT8 and field_type <= DataType.CSV_INT64:
23 |                 value = field.get_int()
24 |             elif field_type == DataType.CSV_DOUBLE:
25 |                 value = field.get_double()
26 | 
27 |             ret[col_name] = value
28 | 
29 |         return ret


--------------------------------------------------------------------------------
/include/internal/col_names.cpp:
--------------------------------------------------------------------------------
 1 | #include "col_names.hpp"
 2 | 
 3 | namespace csv {
 4 |     namespace internals {
 5 |         CSV_INLINE std::vector<std::string> ColNames::get_col_names() const {
 6 |             return this->col_names;
 7 |         }
 8 | 
 9 |         CSV_INLINE void ColNames::set_col_names(const std::vector<std::string>& cnames) {
10 |             this->col_names = cnames;
11 | 
12 |             for (size_t i = 0; i < cnames.size(); i++) {
13 |                 this->col_pos[cnames[i]] = i;
14 |             }
15 |         }
16 | 
17 |         CSV_INLINE int ColNames::index_of(csv::string_view col_name) const {
18 |             auto pos = this->col_pos.find(col_name.data());
19 |             if (pos != this->col_pos.end())
20 |                 return (int)pos->second;
21 | 
22 |             return CSV_NOT_FOUND;
23 |         }
24 | 
25 |         CSV_INLINE size_t ColNames::size() const noexcept {
26 |             return this->col_names.size();
27 |         }
28 | 
29 |     }
30 | }


--------------------------------------------------------------------------------
/tests/test_guess_csv.cpp:
--------------------------------------------------------------------------------
 1 | /** @file
 2 |  *  Tests for CSV parsing
 3 |  */
 4 | 
 5 | #include <stdio.h> // remove()
 6 | #include <sstream>
 7 | #include <catch2/catch_all.hpp>
 8 | #include "csv.hpp"
 9 | 
10 | using namespace csv;
11 | using std::vector;
12 | using std::string;
13 | 
14 | //
15 | // guess_delim()
16 | //
17 | TEST_CASE("guess_delim() Test - Pipe", "[test_guess_pipe]") {
18 |     CSVGuessResult format = guess_format(
19 |         "./tests/data/real_data/2009PowerStatus.txt");
20 |     REQUIRE(format.delim == '|');
21 |     REQUIRE(format.header_row == 0);
22 | }
23 | 
24 | TEST_CASE("guess_delim() Test - Semi-Colon", "[test_guess_scolon]") {
25 |     CSVGuessResult format = guess_format(
26 |         "./tests/data/real_data/YEAR07_CBSA_NAC3.txt");
27 |     REQUIRE(format.delim == ';');
28 |     REQUIRE(format.header_row == 0);
29 | }
30 | 
31 | TEST_CASE("guess_delim() Test - CSV with Comments", "[test_guess_comment]") {
32 |     CSVGuessResult format = guess_format(
33 |         "./tests/data/fake_data/ints_comments.csv");
34 |     REQUIRE(format.delim == ',');
35 |     REQUIRE(format.header_row == 5);
36 | }


--------------------------------------------------------------------------------
/programs/csv_guess_bench.cpp:
--------------------------------------------------------------------------------
 1 | // Calculate benchmarks for CSV guessing
 2 | 
 3 | #include "csv.hpp"
 4 | #include <chrono>
 5 | #include <iostream>
 6 | #include <sstream>
 7 | 
 8 | int main(int argc, char** argv) {
 9 |     using namespace csv;
10 | 
11 |     if (argc < 2) {
12 |         std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
13 |         exit(1);
14 |     }
15 | 
16 |     std::string filename = argv[1];
17 |     std::vector<double> times = {};
18 |     int trials = 5;
19 | 
20 |     for (int i = 0; i < trials; i++) {
21 |         auto start = std::chrono::system_clock::now();
22 | 
23 |         // This reads just the first 500 kb of a file
24 |         CSVReader reader(filename, CSVFormat::guess_csv());
25 | 
26 |         auto end = std::chrono::system_clock::now();
27 |         std::chrono::duration<double> diff = end - start;
28 |         times.push_back(diff.count());
29 |     }
30 | 
31 |     double avg = 0;
32 |     for (double time: times) {
33 |         avg += time * 1/trials;
34 |     }
35 |     std::cout << "Guessing took: " << avg << " seconds (averaged over " << trials << " trials)" << std::endl;
36 | 
37 |     return 0;
38 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017-2019 Vincent La
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/include/internal/col_names.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <memory>
 3 | #include <unordered_map>
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | #include "common.hpp"
 8 | 
 9 | namespace csv {
10 |     namespace internals {
11 |         struct ColNames;
12 |         using ColNamesPtr = std::shared_ptr<ColNames>;
13 | 
14 |         /** @struct ColNames
15 |              *  A data structure for handling column name information.
16 |              *
17 |              *  These are created by CSVReader and passed (via smart pointer)
18 |              *  to CSVRow objects it creates, thus
19 |              *  allowing for indexing by column name.
20 |              */
21 |         struct ColNames {
22 |         public:
23 |             ColNames() = default;
24 |             ColNames(const std::vector<std::string>& names) {
25 |                 set_col_names(names);
26 |             }
27 | 
28 |             std::vector<std::string> get_col_names() const;
29 |             void set_col_names(const std::vector<std::string>&);
30 |             int index_of(csv::string_view) const;
31 | 
32 |             bool empty() const noexcept { return this->col_names.empty(); }
33 |             size_t size() const noexcept;
34 | 
35 |         private:
36 |             std::vector<std::string> col_names;
37 |             std::unordered_map<std::string, size_t> col_pos;
38 |         };
39 |     }
40 | }


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include(FetchContent)
 2 | 
 3 | FetchContent_Declare(
 4 |   Catch2
 5 |   GIT_REPOSITORY https://github.com/catchorg/Catch2.git
 6 |   GIT_TAG v3.6.0
 7 | ) 
 8 | 
 9 | FetchContent_MakeAvailable(Catch2)
10 | 
11 | add_executable(csv_test "")
12 | target_sources(csv_test
13 |     PRIVATE
14 |         ${CSV_INCLUDE_DIR}/csv.hpp
15 |         main.cpp
16 |         test_csv_field.cpp
17 |         test_csv_field_array.cpp
18 |         test_csv_format.cpp
19 |         test_csv_iterator.cpp
20 |         test_csv_row.cpp
21 |         test_csv_row_json.cpp
22 |         test_csv_stat.cpp
23 |         test_guess_csv.cpp
24 |         test_read_csv.cpp
25 |         test_read_csv_file.cpp
26 |         test_write_csv.cpp
27 |         test_data_type.cpp
28 |         test_raw_csv_data.cpp
29 |         test_round_trip.cpp
30 |         test_csv_delimeter.cpp
31 |         test_csv_ranges.cpp
32 |     )
33 | target_link_libraries(csv_test csv)
34 | target_link_libraries(csv_test Catch2::Catch2WithMain)
35 | 
36 | if(MSVC)
37 |     # Workaround to enable debugging unit tests in Visual Studio
38 |     add_custom_command(
39 |         TARGET csv_test POST_BUILD
40 |         COMMAND ${CMAKE_COMMAND} -E copy_directory
41 |         ${CSV_TEST_DIR}/data $<TARGET_FILE_DIR:csv_test>/tests/data
42 |     )
43 | endif()
44 | 
45 | add_test(
46 |     NAME test
47 |     COMMAND csv_test
48 |     WORKING_DIRECTORY ${CSV_ROOT_DIR}
49 | )


--------------------------------------------------------------------------------
/tests/shared/float_test_cases.hpp:
--------------------------------------------------------------------------------
 1 | #include <string>
 2 | #include <tuple>
 3 | 
 4 | using std::make_tuple;
 5 | 
 6 | namespace csv_test {
 7 |     /** Each entry pairs an input string with the long double value it spells out */
 8 |     static const std::initializer_list<std::tuple<std::string, long double>> FLOAT_TEST_CASES = {
 9 |         // Plain decimals, an explicit plus sign, and surrounding whitespace
10 |         std::make_tuple("3.14", 3.14L),
11 |         std::make_tuple("+3.14", 3.14L),
12 |         std::make_tuple("       -3.14            ", -3.14L),
13 |         std::make_tuple("2.71828", 2.71828L),
14 | 
15 |         // Positive values between 0 and 1
16 |         std::make_tuple("0.12", 0.12L),
17 |         std::make_tuple("0.334", 0.334L),
18 |         std::make_tuple("0.625", 0.625L),
19 |         std::make_tuple("0.666666", 0.666666L),
20 |         std::make_tuple("0.69", 0.69L),
21 | 
22 |         // The same magnitudes, negated
23 |         std::make_tuple("-0.12", -0.12L),
24 |         std::make_tuple("-0.334", -0.334L),
25 |         std::make_tuple("-0.625", -0.625L),
26 |         std::make_tuple("-0.666666", -0.666666L),
27 |         std::make_tuple("-0.69", -0.69L),
28 | 
29 |         // Larger positive numbers
30 |         std::make_tuple("1000.00", 1000.0L),
31 |         std::make_tuple("1000000.00", 1000000.0L),
32 |         std::make_tuple("9999999.99", 9999999.99L),
33 |         std::make_tuple("99999999.999", 99999999.999L),
34 | 
35 |         // Larger negative numbers
36 |         std::make_tuple("-1000.00", -1000.0L),
37 |         std::make_tuple("-1000000.00", -1000000.0L),
38 |         std::make_tuple("-9999999.99", -9999999.99L),
39 |         std::make_tuple("-99999999.999", -99999999.999L),
40 |     };
41 | }


--------------------------------------------------------------------------------
/programs/csv_bench.cpp:
--------------------------------------------------------------------------------
 1 | // Calculate benchmarks for CSV parser
 2 | 
 3 | #include "csv.hpp"
 4 | #include <chrono>
 5 | #include <iostream>
 6 | #include <sstream>
 7 | 
 8 | /** Times get_file_info() on the CSV file named by argv[1], then prints the
 9 |  *  elapsed seconds, the file's dimensions and its column names.
10 |  *
11 |  *  @return 1 when no file argument is given, 0 otherwise
12 |  */
13 | int main(int argc, char** argv) {
14 |     using namespace csv;
15 | 
16 |     if (argc < 2) {
17 |         std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
18 |         return 1;
19 |     }
20 | 
21 |     // Benchmark 1: File IO + Parsing
22 |     // steady_clock is monotonic, so the measured interval cannot be skewed
23 |     // by wall-clock adjustments the way a system_clock interval can.
24 |     std::string filename = argv[1];
25 |     auto start = std::chrono::steady_clock::now();
26 |     auto info = get_file_info(filename);
27 |     auto end = std::chrono::steady_clock::now();
28 |     std::chrono::duration<double> diff = end - start;
29 | 
30 |     std::cout << "Parsing took (including disk IO): " << diff.count() << std::endl;
31 |     std::cout << "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns " << std::endl;
32 |     std::cout << "Columns: ";
33 |     for (const auto& col : info.col_names) {
34 |         std::cout << " " << col;
35 |     }
36 |     std::cout << std::endl;
37 | 
38 |     // Benchmark 2: Parsing Only (currently disabled)
39 |     /*
40 |     std::ifstream csv(filename);
41 |     std::stringstream buffer;
42 |     buffer << csv.rdbuf();
43 | 
44 |     auto csv_str = buffer.str();
45 | 
46 |     start = std::chrono::steady_clock::now();
47 |     parse(csv_str);
48 |     end = std::chrono::steady_clock::now();
49 |     diff = end - start;
50 | 
51 |     std::cout << "Parsing took: " << diff.count() << std::endl;
52 |     */
53 | 
54 |     return 0;
55 | }


--------------------------------------------------------------------------------
/include/internal/csv_utility.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "common.hpp"
 3 | #include "csv_format.hpp"
 4 | #include "csv_reader.hpp"
 5 | #include "data_type.hpp"
 6 | 
 7 | #include <string>
 8 | #include <type_traits>
 9 | #include <unordered_map>
10 | 
11 | namespace csv {
12 |     /** Summary of a CSV file, as returned by get_file_info() */
13 |     struct CSVFileInfo {
14 |         std::string filename;               /**< Filename that was passed to get_file_info() */
15 |         std::vector<std::string> col_names; /**< CSV column names, as reported by the reader */
16 |         char delim;                         /**< Delimiting character of the reader's format */
17 |         size_t n_rows;                      /**< Number of rows the reader counted in the file */
18 |         size_t n_cols;                      /**< Number of columns (size of col_names) */
19 |     };
20 | 
21 |     /** @name Shorthand Parsing Functions
22 |      *  @brief Convenience functions for parsing small strings
23 |      */
24 |     ///@{
25 |     CSVReader operator""_csv(const char*, size_t);            // no space after "": that form is deprecated
26 |     CSVReader operator""_csv_no_header(const char*, size_t);  // literal form of parse_no_header()
27 |     CSVReader parse(csv::string_view in, CSVFormat format = CSVFormat());
28 |     CSVReader parse_no_header(csv::string_view in);
29 |     ///@}
30 | 
31 |     /** @name Utility Functions */
32 |     ///@{
33 |     std::unordered_map<std::string, DataType> csv_data_types(const std::string&);
34 |     CSVFileInfo get_file_info(const std::string& filename);
35 |     int get_col_pos(csv::string_view filename, csv::string_view col_name,
36 |         const CSVFormat& format = CSVFormat::guess_csv());  // format is guessed unless one is supplied
37 |     ///@}
38 | }


--------------------------------------------------------------------------------
/include/csv.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | CSV for C++, version 2.3.0
 3 | https://github.com/vincentlaucsb/csv-parser
 4 | 
 5 | MIT License
 6 | 
 7 | Copyright (c) 2017-2024 Vincent La
 8 | 
 9 | Permission is hereby granted, free of charge, to any person obtaining a copy
10 | of this software and associated documentation files (the "Software"), to deal
11 | in the Software without restriction, including without limitation the rights
12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | copies of the Software, and to permit persons to whom the Software is
14 | furnished to do so, subject to the following conditions:
15 | 
16 | The above copyright notice and this permission notice shall be included in all
17 | copies or substantial portions of the Software.
18 | 
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.
26 | */
27 | 
28 | #pragma once
29 | #ifndef CSV_HPP
30 | #define CSV_HPP
31 | 
32 | #include "internal/csv_reader.hpp"
33 | #include "internal/csv_stat.hpp"
34 | #include "internal/csv_utility.hpp"
35 | #include "internal/csv_writer.hpp"
36 | 
37 | /** INSERT_CSV_SOURCES **/
38 | 
39 | #endif


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Custom Settings
  2 | CMakeLists2.txt
  3 | 
  4 | # Build
  5 | bin/
  6 | build/
  7 | 
  8 | # Build: Python
  9 | *.pyc
 10 | *.pyd
 11 | 
 12 | # Doxygen
 13 | docs/html
 14 | *.tmp
 15 | 
 16 | # Visual Studio
 17 | .vs/
 18 | *.pdb
 19 | *.i*
 20 | *.*log
 21 | 
 22 | # Compiled Object files
 23 | *.slo
 24 | *.lo
 25 | *.o
 26 | *.obj
 27 | 
 28 | # Precompiled Headers
 29 | *.gch
 30 | *.pch
 31 | 
 32 | # Compiled Dynamic libraries
 33 | *.so
 34 | *.dylib
 35 | *.dll
 36 | 
 37 | # Fortran module files
 38 | *.mod
 39 | 
 40 | # Compiled Static libraries
 41 | *.lai
 42 | *.la
 43 | *.a
 44 | *.lib
 45 | 
 46 | # Executables
 47 | *.exe
 48 | *.out
 49 | *.app
 50 | 
 51 | # Test outputs
 52 | *.gcda
 53 | *.gcno
 54 | *.gcov
 55 | 
 56 | # =========================
 57 | # Operating System Files
 58 | # =========================
 59 | 
 60 | # OSX
 61 | # =========================
 62 | 
 63 | .DS_Store
 64 | .AppleDouble
 65 | .LSOverride
 66 | 
 67 | # Thumbnails
 68 | ._*
 69 | 
 70 | # Files that might appear in the root of a volume
 71 | .DocumentRevisions-V100
 72 | .fseventsd
 73 | .Spotlight-V100
 74 | .TemporaryItems
 75 | .Trashes
 76 | .VolumeIcon.icns
 77 | 
 78 | # Directories potentially created on remote AFP share
 79 | .AppleDB
 80 | .AppleDesktop
 81 | Network Trash Folder
 82 | Temporary Items
 83 | .apdisk
 84 | 
 85 | # Windows
 86 | # =========================
 87 | 
 88 | # Windows image file caches
 89 | Thumbs.db
 90 | ehthumbs.db
 91 | 
 92 | # Folder config file
 93 | Desktop.ini
 94 | 
 95 | # Recycle Bin used on file shares
 96 | $RECYCLE.BIN/
 97 | 
 98 | # Windows Installer files
 99 | *.cab
100 | *.msi
101 | *.msm
102 | *.msp
103 | 
104 | # Windows shortcuts
105 | *.lnk
106 | 


--------------------------------------------------------------------------------
/programs/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/csv_info.cpp)  # Always built, unlike the benchmarks below
 2 | target_link_libraries(csv_info csv)
 3 | 
 4 | add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/csv_stats.cpp)  # Always built as well
 5 | target_link_libraries(csv_stats csv)
 6 | 
 7 | # Provide rudimentary benchmarks (only configured when CSV_DEVELOPER is set)
 8 | if(CSV_DEVELOPER)
 9 | 	add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/csv_guess_bench.cpp)
10 | 	target_link_libraries(csv_guess_bench csv)
11 | 
12 | 	# Benchmarks for parsing speed
13 | 	add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/csv_bench.cpp)
14 | 	target_link_libraries(csv_bench csv)
15 | 
16 | 	add_custom_target(generate_csv_bench  # Runs csv_bench on 2015_StateDepartment.csv from the real_data directory
17 | 		COMMAND csv_bench 2015_StateDepartment.csv
18 | 		WORKING_DIRECTORY ${CSV_TEST_DIR}/data/real_data
19 | 	)
20 | 
21 | 	# Probe the compiler first: programs that need <charconv> are only added when it is available
22 | 	include(CheckCXXSourceCompiles)
23 | 	check_cxx_source_compiles("
24 | 		#include <charconv>
25 | 		
26 | 		int main(int argc, char** argv) {
27 | 			return 0;
28 | 		}
29 | 	" haveCharconv)  # True when <charconv> can be included
30 | 	
31 | 	check_cxx_source_compiles("
32 | 		#include <charconv>
33 | 		
34 | 		int main(int argc, char** argv) {
35 | 			const char* str = \"123.456\";
36 | 			long double d;
37 | 			std::from_chars(str, str + 7, d);
38 | 			return 0;
39 | 		}
40 | 	" FROM_CHARS_SUPPORT_DOUBLE)  # True when std::from_chars compiles with a long double target
41 | 
42 | 	if(haveCharconv)
43 | 		add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/csv_generator.cpp)
44 | 		target_link_libraries(csv_generator csv)
45 | 
46 | 		# Benchmarks for data_type() function
47 | 		if(FROM_CHARS_SUPPORT_DOUBLE)
48 | 			add_definitions(-DFROM_CHARS_SUPPORT_DOUBLE)  # NOTE(review): directory-scoped, so every target in this directory sees the define
49 | 		endif()
50 | 		add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/data_type_bench.cpp)
51 | 		target_link_libraries(data_type_bench csv)
52 | 
53 | 		add_custom_target(generate_dtype_bench
54 | 			COMMAND data_type_bench 2015_StateDepartment.csv "Regular Pay"
55 | 			WORKING_DIRECTORY ${CSV_TEST_DIR}/data/real_data)
56 |     endif()
57 | endif()


--------------------------------------------------------------------------------
/tests/test_csv_ranges.cpp:
--------------------------------------------------------------------------------
 1 | #include <catch2/catch_all.hpp>
 2 | #include <csv.hpp>
 3 | 
 4 | #ifdef CSV_HAS_CXX20
 5 | #include <ranges>
 6 | 
 7 | TEST_CASE("CSVReader C++20 Ranges Compatibility", "[ranges][cxx20]") {
 8 |     SECTION("CSVReader works with std::ranges::distance") {
 9 |         std::stringstream input("A,B,C\n1,2,3\n4,5,6\n7,8,9");
10 |         csv::CSVReader rows(input);
11 | 
12 |         // One header line followed by three data rows
13 |         REQUIRE(std::ranges::distance(rows) == 3);
14 |     }
15 | 
16 |     SECTION("CSVReader works with std::views") {
17 |         std::stringstream input("A,B,C\n1,2,3\n4,5,6\n7,8,9\n10,11,12");
18 |         csv::CSVReader rows(input);
19 | 
20 |         // Keep only the non-empty rows whose first field exceeds 5
21 |         const auto first_above_five = [](const csv::CSVRow &row) {
22 |             return !row.empty() && row[0].get<int>() > 5;
23 |         };
24 |         auto matches = rows | std::views::filter(first_above_five);
25 |         int n_matches = 0;
26 |         for (const auto &row : matches) {
27 |             ++n_matches;
28 |             REQUIRE(row[0].get<int>() > 5);
29 |         }
30 | 
31 |         REQUIRE(n_matches == 2); // rows with 7 and 10
32 |     }
33 | 
34 |     SECTION("CSVReader iterator satisfies input_range requirements") {
35 |         std::stringstream input("A,B\n1,2\n3,4");
36 |         csv::CSVReader rows(input);
37 | 
38 |         auto cursor = rows.begin();
39 |         auto last = rows.end();
40 | 
41 |         static_assert(std::input_iterator<decltype(cursor)>);
42 |         static_assert(std::ranges::range<csv::CSVReader>);
43 |         static_assert(std::ranges::input_range<csv::CSVReader>);
44 |         static_assert(std::sentinel_for<decltype(last), decltype(cursor)>);
45 | 
46 |         REQUIRE(cursor != last);  // first data row is available
47 |         auto row = *cursor;
48 |         REQUIRE(row.size() == 2);
49 | 
50 |         ++cursor;
51 |         REQUIRE(cursor != last);  // second data row
52 | 
53 |         ++cursor;
54 |         REQUIRE(cursor == last);  // input exhausted
55 |     }
56 | }
57 | #endif
58 | 


--------------------------------------------------------------------------------
/tests/test_round_trip.cpp:
--------------------------------------------------------------------------------
 1 | /** Tests of both reading and writing */
 2 | 
 3 | #include <array>
 4 | #include <catch2/catch_all.hpp>
 5 | #include <cstdio>
 6 | #include <iostream>
 7 | 
 8 | #include "csv.hpp"
 9 | 
10 | using namespace csv;
11 | 
12 | TEST_CASE("Simple Buffered Integer Round Trip Test", "[test_roundtrip_int]") {
13 |     const char* filename = "round_trip.csv";
14 |     std::ofstream outfile(filename, std::ios::binary);
15 |     auto writer = make_csv_writer_buffered(outfile);
16 | 
17 |     // Header row
18 |     writer << std::vector<std::string>({"A", "B", "C", "D", "E"});
19 |     const size_t n_rows = 1000000;
20 | 
21 |     // Row k repeats the decimal text of k in each of its five columns
22 |     for (size_t k = 0; k < n_rows; ++k) {
23 |         auto digits = internals::to_string(k);
24 |         writer << std::array<csv::string_view, 5>({digits, digits, digits, digits, digits});
25 |     }
26 |     writer.flush();
27 | 
28 |     CSVReader reader(filename);
29 |     size_t expected = 0;
30 |     for (auto &row : reader) {
31 |         for (auto &field : row) {
32 |             REQUIRE(field == expected);
33 |         }
34 | 
35 |         ++expected;
36 |     }
37 | 
38 |     REQUIRE(reader.n_rows() == n_rows);
39 | 
40 |     remove(filename);
41 | }
42 | 
43 | TEST_CASE("Simple Integer Round Trip Test", "[test_roundtrip_int]") {
44 |     const char* filename = "round_trip.csv";
45 |     std::ofstream outfile(filename, std::ios::binary);
46 |     auto writer = make_csv_writer(outfile);
47 | 
48 |     // Header row
49 |     writer << std::vector<std::string>({ "A", "B", "C", "D", "E" });
50 |     const size_t n_rows = 1000000;
51 | 
52 |     // Row k repeats the decimal text of k in each of its five columns
53 |     for (size_t k = 0; k < n_rows; ++k) {
54 |         auto digits = internals::to_string(k);
55 |         writer << std::array<csv::string_view, 5>({ digits, digits, digits, digits, digits });
56 |     }
57 | 
58 |     // Read the file back: every field of the k-th row must equal k
59 |     CSVReader reader(filename);
60 |     size_t expected = 0;
61 |     for (auto& row : reader) {
62 |         for (auto& field : row) {
63 |             REQUIRE(field == expected);
64 |         }
65 | 
66 |         ++expected;
67 |     }
68 | 
69 |     REQUIRE(reader.n_rows() == n_rows);
70 | 
71 |     remove(filename);
72 | }


--------------------------------------------------------------------------------
/tests/test_csv_format.cpp:
--------------------------------------------------------------------------------
 1 | #include <catch2/catch_all.hpp>
 2 | #include "csv.hpp"
 3 | using namespace csv;
 4 | 
 5 | static std::string err_preamble = "There should be no overlap between "
 6 |     "the quote character, the set of possible "
 7 |     "delimiters and the set of whitespace characters.";
 8 | 
 9 | // Assert that an error is thrown if whitespace, delimiter, and quote 
10 | TEST_CASE("CSVFormat - Overlapping Characters", "[csv_format_overlap]") {
11 |     CSVFormat format;
12 |     bool err_caught = false;
13 | 
14 |     SECTION("Tab") {
15 |         try {
16 |             format.delimiter('\t').quote('"').trim({ '\t' });
17 |         }
18 |         catch (std::runtime_error& err) {
19 |             err_caught = true;
20 |             REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\t'."));
21 |         }
22 | 
23 |         REQUIRE(err_caught);
24 |     }
25 | 
26 |     SECTION("Tab with multiple other characters") {
27 |         try {
28 |             format.delimiter({ ',', '\t' }).quote('"').trim({ ' ', '\t' });
29 |         }
30 |         catch (std::runtime_error& err) {
31 |             err_caught = true;
32 |             REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\t'."));
33 |         }
34 | 
35 |         REQUIRE(err_caught);
36 |     }
37 | 
38 |     SECTION("Repeated quote") {
39 |         try {
40 |             format.delimiter({ ',', '"' }).quote('"').trim({ ' ', '\t' });
41 |         }
42 |         catch (std::runtime_error& err) {
43 |             err_caught = true;
44 |             REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\"'."));
45 |         }
46 | 
47 |         REQUIRE(err_caught);
48 |     }
49 | 
50 |     SECTION("Multiple offenders") {
51 |         try {
52 |             format.delimiter({ ',', '\t', ' ' }).quote('"').trim({ ' ', '\t' });
53 |         }
54 |         catch (std::runtime_error& err) {
55 |             err_caught = true;
56 |             REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\t', ' '."));
57 |         }
58 | 
59 |         REQUIRE(err_caught);
60 |     }
61 | }


--------------------------------------------------------------------------------
/include/internal/csv_stat.hpp:
--------------------------------------------------------------------------------
 1 | /** @file
 2 |  *  Calculates statistics from CSV files
 3 |  */
 4 | 
 5 | #pragma once
 6 | #include <unordered_map>
 7 | #include <sstream>
 8 | #include <vector>
 9 | #include "csv_reader.hpp"
10 | 
11 | namespace csv {
12 |     /** Class for calculating statistics from CSV files and in-memory sources
13 |      *
14 |      *  **Example**
15 |      *  \include programs/csv_stats.cpp
16 |      *
17 |      */
18 |     class CSVStat {
19 |     public:
20 |         using FreqCount = std::unordered_map<std::string, size_t>; // string key -> count
21 |         using TypeCount = std::unordered_map<DataType, size_t>;    // DataType key -> count
22 |         // Result accessors; each returns a vector (presumably one entry per column; confirm in csv_stat.cpp)
23 |         std::vector<long double> get_mean() const;
24 |         std::vector<long double> get_variance() const;
25 |         std::vector<long double> get_mins() const;
26 |         std::vector<long double> get_maxes() const;
27 |         std::vector<FreqCount> get_counts() const;
28 |         std::vector<TypeCount> get_dtypes() const;
29 |         // Column names, forwarded from the underlying reader
30 |         std::vector<std::string> get_col_names() const {
31 |             return this->reader.get_col_names();
32 |         }
33 |         // Build from a named file (format defaults to CSVFormat::guess_csv()) or from a std::stringstream
34 |         CSVStat(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv());
35 |         CSVStat(std::stringstream& source, CSVFormat format = CSVFormat());
36 |     private:
37 |         // An array of rolling averages
38 |         // Each index corresponds to the rolling mean for the column at said index
39 |         std::vector<long double> rolling_means;
40 |         std::vector<long double> rolling_vars;
41 |         std::vector<long double> mins;
42 |         std::vector<long double> maxes;
43 |         std::vector<FreqCount> counts;
44 |         std::vector<TypeCount> dtypes;
45 |         std::vector<long double> n; // NOTE(review): presumably a per-column count of values seen; confirm in csv_stat.cpp
46 | 
47 |         // Statistic calculators
48 |         void variance(const long double&, const size_t&);
49 |         void count(CSVField&, const size_t&);
50 |         void min_max(const long double&, const size_t&);
51 |         void dtype(CSVField&, const size_t&);
52 | 
53 |         void calc();
54 |         void calc_chunk();
55 |         void calc_worker(const size_t&);
56 | 
57 |         CSVReader reader;                // also supplies get_col_names()
58 |         std::deque<CSVRow> records = {};
59 |     };
60 | }


--------------------------------------------------------------------------------
/include/internal/csv_reader_iterator.cpp:
--------------------------------------------------------------------------------
 1 | /** @file
 2 |  *  Defines an input iterator for csv::CSVReader
 3 |  */
 4 | 
 5 | #include "csv_reader.hpp"
 6 | 
 7 | namespace csv {
 8 |     /** Return an iterator to the first buffered row, or end() if a fresh read yields none */
 9 |     CSV_INLINE CSVReader::iterator CSVReader::begin() {
10 |         if (this->records->empty()) {
11 |             // Run read_csv() on the worker thread and block until it returns
12 |             this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE);
13 |             this->read_csv_worker.join();
14 | 
15 |             if (this->records->empty()) return this->end();  // still empty => end iterator
16 |         }
17 | 
18 |         // Count the row about to be handed out, then move it into the iterator
19 |         this->_n_rows++;
20 |         return CSVReader::iterator(this, this->records->pop_front());
21 |     }
22 | 
23 |     /** Sentinel standing in for the imaginary row past the end of a CSV.
24 |      *  It is a default-constructed iterator; attempting to dereference it will lead to bad things.
25 |      */
26 |     CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept {
27 |         return iterator();
28 |     }
29 | 
30 |     /////////////////////////
31 |     // CSVReader::iterator //
32 |     /////////////////////////
33 | 
34 |     /** Bind the iterator to its reader and move the given row into it */
35 |     CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : daddy(_daddy) {
36 |         this->row = std::move(_row);
37 |     }
38 | 
39 |     /** Advance the iterator by one row. If this CSVReader has an
40 |      *  associated file, then the iterator will lazily pull more data from
41 |      *  that file until the end of file is reached.
42 |      *
43 |      *  @note This iterator does **not** block the thread responsible for parsing CSV.
44 |      *  @return *this, which compares equal to end() once no further row could be read
45 |      */
46 |     CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() {
47 |         // read_row() overwrites the held row and reports whether one was available
48 |         const bool has_row = daddy->read_row(this->row);
49 |         if (!has_row) this->daddy = nullptr; // this == end()
50 | 
51 |         return *this;
52 |     }
53 | 
54 |     /** Post-increment iterator: advances like operator++() but returns the state from before the step */
55 |     CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) {
56 |         auto previous = *this;
57 | 
58 |         // Same step as the pre-increment form: read the next row,
59 |         // or become end() when there is none
60 |         ++(*this);
61 |         return previous;
62 |     }
63 | }
64 | 


--------------------------------------------------------------------------------
/programs/data_type_bench.cpp:
--------------------------------------------------------------------------------
 1 | #include <charconv>
 2 | #include <chrono>
 3 | #include <algorithm>
 4 | #include "csv.hpp"
 5 | #ifndef NDEBUG
 6 | #define NDEBUG
 7 | #endif
 8 | 
 9 | long double get_max(std::string file, std::string column, bool use_std = false);
10 | // Largest value found in `column` of `file`; use_std selects standard-library parsing over CSVField::get<long double>()
11 | long double get_max(std::string file, std::string column, bool use_std) {
12 |     using namespace csv;
13 |     long double largest = -std::numeric_limits<long double>::infinity();
14 |     CSVReader reader(file);
15 | 
16 |     // Visit every row, converting the requested column each time
17 |     for (auto& row : reader) {
18 |         auto field = row[column];
19 |         long double parsed = 0;
20 | 
21 |         if (!use_std) {
22 |             // The library's own numeric conversion
23 |             parsed = field.get<long double>();
24 |         }
25 |         else {
26 |             // Standard-library conversion of the same text
27 |             auto text = field.get<std::string_view>();
28 | #ifdef FROM_CHARS_SUPPORT_DOUBLE
29 |             const char* first = text.data();
30 |             std::from_chars(first, first + text.size(), parsed);
31 | #else
32 |             // Stream extraction stands in when FROM_CHARS_SUPPORT_DOUBLE is not defined
33 |             std::string owned(text);
34 |             std::stringstream ss(owned);
35 |             ss >> parsed;
36 | #endif
37 |         }
38 | 
39 |         // Keep the running maximum
40 |         largest = std::max(largest, parsed);
41 |     }
42 | 
43 |     return largest;
44 | }
45 | 
46 | /** Benchmark entry point: over five trials, averages the time get_max() takes
47 |  *  with each parsing strategy on the given file and column, then prints both
48 |  *  averages and the last maximum found.
49 |  *
50 |  *  @return 1 when the file or column argument is missing, 0 otherwise
51 |  */
52 | int main(int argc, char** argv) {
53 |     using namespace csv;
54 | 
55 |     if (argc < 3) {
56 |         std::cout << "Usage: " << argv[0] << " [file] [column]" << std::endl;
57 |         return 1;
58 |     }
59 | 
60 |     const std::string file = argv[1];
61 |     const std::string column = argv[2];
62 | 
63 |     // steady_clock is monotonic, so intervals cannot be skewed by wall-clock adjustments
64 |     using bench_clock = std::chrono::steady_clock;
65 | 
66 |     long double max = 0, std_avg = 0, csv_avg = 0;
67 |     const size_t trials = 5;  // integral, so the loop bound is not a floating point comparison
68 |     const long double per_trial = static_cast<long double>(trials);
69 | 
70 |     for (size_t i = 0; i < trials; i++) {
71 |         auto start = bench_clock::now();
72 |         max = get_max(file, column, true);
73 |         auto end = bench_clock::now();
74 |         std::chrono::duration<double> diff = end - start;
75 |         std_avg += diff.count() / per_trial;
76 | 
77 |         start = bench_clock::now();
78 |         max = get_max(file, column, false);
79 |         end = bench_clock::now();
80 |         diff = end - start;
81 |         csv_avg += diff.count() / per_trial;
82 |     }
83 | 
84 |     std::cout << "std::from_chars: " << std_avg << std::endl;
85 |     std::cout << "csv::data_type: " << csv_avg << std::endl;
86 |     std::cout << "Maximum value: " << max << std::endl;
87 | 
88 |     return 0;
89 | }


--------------------------------------------------------------------------------
/tests/test_csv_row.cpp:
--------------------------------------------------------------------------------
 1 | // Tests for the CSVRow and CSVField Data Structures
 2 | 
 3 | #include <catch2/catch_all.hpp>
 4 | #include "csv.hpp"
 5 | using namespace csv;
 6 | 
 7 | // Construct a CSVRow and assert that its interface works as expected
 8 | TEST_CASE("CSVRow Test", "[test_csv_row]") {
 9 |     auto reader = "A,B,C,D\r\n"
10 |                   "Col1,Col2,Col3,Col4"_csv;
11 | 
12 |     CSVRow row;
13 |     reader.read_row(row);
14 |     
15 |     bool error_caught = false;
16 | 
17 |     SECTION("size() Check") {
18 |         REQUIRE(row.size() == 4);
19 |     }
20 | 
21 |     SECTION("operator[]") {
22 |         REQUIRE(row[1] == "Col2");
23 |         REQUIRE(row["B"] == "Col2");
24 | 
25 |         REQUIRE(row[2] == "Col3");
26 |         REQUIRE(row["C"] == "Col3");
27 |     }
28 | 
29 |     SECTION("operator[] Out of Bounds") {
30 |         try {
31 |             row[4].get<>();
32 |         }
33 |         catch (const std::runtime_error&) {
34 |             error_caught = true;
35 |         }
36 | 
37 |         REQUIRE(error_caught);
38 |     }
39 | 
40 |     SECTION("operator[] Access Non-Existent Column") {
41 |         try {
42 |             row["Col5"].get<>();
43 |         }
44 |         catch (const std::runtime_error&) {
45 |             error_caught = true;
46 |         }
47 | 
48 |         REQUIRE(error_caught);
49 |     }
50 | 
51 |     SECTION("Content Check") {
52 |         REQUIRE(std::vector<std::string>(row) ==
53 |             std::vector<std::string>({ "Col1", "Col2", "Col3", "Col4" }));
54 |     }
55 | 
56 |     /** Allow get_sv() to be used with a const CSVField
57 |      *  
58 |      *  See: https://github.com/vincentlaucsb/csv-parser/issues/86
59 |      *
60 |      */
61 |     SECTION("get_sv() Check") {
62 |         std::vector<std::string> content;
63 | 
64 |         for (const auto& field : row) {
65 |             content.push_back(std::string(field.get_sv()));
66 |         }
67 |         // Check the strings collected through get_sv(), not the row's own vector conversion
68 |         REQUIRE(content ==
69 |             std::vector<std::string>({ "Col1", "Col2", "Col3", "Col4" }));
70 |     }
71 | }
72 | 
73 | // Integration test for CSVRow/CSVField
74 | TEST_CASE("CSVField operator==", "[test_csv_field_equal]") {
75 |     auto parsed = "A,B,C,D\r\n"
76 |                   "1,2,3,3.14"_csv;
77 | 
78 |     CSVRow first_row;
79 |     parsed.read_row(first_row);
80 | 
81 |     // Integer-valued columns are compared directly against int literals
82 |     REQUIRE(first_row["A"] == 1);
83 |     REQUIRE(first_row["B"] == 2);
84 |     REQUIRE(first_row["C"] == 3);
85 | 
86 |     // The floating point column goes through internals::is_equal()
87 |     const long double d = first_row["D"].get<long double>();
88 |     REQUIRE(internals::is_equal(d, 3.14L));
89 | }


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile used for building/testing on Travis CI
 2 | 
 3 | # Force Travis to use updated compilers
 4 | ifeq ($(TRAVIS_COMPILER), gcc)
 5 | 	CXX = g++-8
 6 | else ifeq ($(TRAVIS_COMPILER), clang)
 7 | 	CXX = clang++
 8 | endif
 9 | 
10 | ifeq ($(STD), )
11 | 	STD = c++11
12 | endif
13 | 
14 | BUILD_DIR = build
15 | SOURCE_DIR = include
16 | SINGLE_INCLUDE_DIR = single_include
17 | TEST_DIR = tests
18 | CFLAGS = -pthread -std=$(STD)
19 | 
20 | TEST_OFLAGS =
21 | ifeq ($(CXX), g++-8)
22 | 	TEST_OFLAGS = -Og
23 | endif
24 | 
25 | TEST_FLAGS = -Itests/ $(CFLAGS) $(TEST_OFLAGS) -g --coverage -Wno-unknown-pragmas -Wall
26 | 
27 | # Main Library (OBJECTS maps each source to an object file under $(BUILD_DIR))
28 | SOURCES = $(wildcard include/internal/*.cpp)
29 | OBJECTS = $(patsubst include/internal/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES))
30 | 
31 | TEST_SOURCES = $(wildcard tests/*.cpp)
32 | TEST_SOURCES_NO_EXT = $(subst tests/,,$(subst .cpp,,$(TEST_SOURCES)))
33 | # Default goal: build the library, run the tests, then tidy up (only targets defined in this file)
34 | all: csv run_csv_test clean distclean
35 | 
36 | ################
37 | # Main Library #
38 | ################
39 | csv:
40 | 	$(CXX) -c -O3 $(CFLAGS) $(SOURCES)
41 | 	mkdir -p $(BUILD_DIR)
42 | 	mv *.o $(BUILD_DIR)
43 | # The shell expands the glob when ar runs; $(wildcard) was expanded before `make csv` had produced any objects
44 | libcsv.a:
45 | 	$(MAKE) csv
46 | 	ar rvs libcsv.a $(BUILD_DIR)/*.o
47 | 	
48 | docs:
49 | 	doxygen Doxyfile
50 | 	
51 | ############
52 | # Programs #
53 | ############
54 | csv_stats:
55 | 	$(CXX) -o csv_stats -O3 $(CFLAGS) programs/csv_stats.cpp -I$(SINGLE_INCLUDE_DIR)
56 | 	
57 | #########
58 | # Tests #
59 | #########	
60 | csv_test:
61 | 	$(CXX) -o csv_test $(SOURCES) $(TEST_SOURCES) -I${SOURCE_DIR} $(TEST_FLAGS)
62 | 	
63 | run_csv_test: csv_test
64 | 	mkdir -p tests/temp
65 | 	./csv_test
66 | 	
67 | 	# Test Clean-Up
68 | 	rm -rf $(TEST_DIR)/temp
69 | 	
70 | # Run code coverage analysis: gathers the *.gcno/*.gcda files from the working directory into test_results and runs gcov-8 on the library sources
71 | code_cov: csv_test
72 | 	mkdir -p test_results
73 | 	mv *.gcno *.gcda $(PWD)/test_results
74 | 	gcov-8 $(SOURCES) -o test_results --relative-only
75 | 	mv *.gcov test_results
76 | 	
77 | # Generate report
78 | code_cov_report:
79 | 	# Paths below are relative to the repo root: each recipe line runs in its own shell, so a separate cd has no effect
80 | 	lcov --capture --directory test_results --output-file coverage.info
81 | 	genhtml coverage.info --output-directory out
82 | 
83 | valgrind: csv_stats
84 | 	# Can't run valgrind against csv_test because it mangles the working directory
85 | 	# which causes csv_test to not be able to find test files
86 | 	valgrind --leak-check=full ./csv_stats $(TEST_DIR)/data/real_data/2016_Gaz_place_national.txt
87 | # These targets name actions, not files (a docs/ directory exists, which would otherwise make `make docs` a no-op)
88 | .PHONY: all clean distclean docs csv run_csv_test code_cov code_cov_report valgrind
89 | 	
90 | clean:
91 | 	rm -f build/*
92 | 	rm -f *.gc*
93 | 	rm -f libcsv.a
94 | 	rm -f csv_*
95 | 	
96 | distclean: clean


--------------------------------------------------------------------------------
/include/internal/csv_utility.cpp:
--------------------------------------------------------------------------------
 1 | #include <sstream>
 2 | #include <vector>
 3 | 
 4 | #include "csv_utility.hpp"
 5 | 
 6 | namespace csv {
 7 |     /** Shorthand function for parsing CSV text held in memory
 8 |      *
 9 |      *  @return A CSVReader constructed over a copy of `in`
10 |      *  @par Example
11 |      *  @snippet tests/test_read_csv.cpp Parse Example
12 |      */
13 |     CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) {
14 |         const std::string text(in.data(), in.length());
15 |         std::stringstream stream(text);
16 |         return CSVReader(stream, format);
17 |     }
18 | 
19 |     /** Parse CSV text that has no header row
20 |      *
21 |      *  @return The CSVReader produced by parse() with header_row(-1) applied
22 |      */
23 |     CSV_INLINE CSVReader parse_no_header(csv::string_view in) {
24 |         CSVFormat headerless;
25 |         headerless.header_row(-1);
26 | 
27 |         return parse(in, headerless);
28 |     }
29 | 
30 |     /** Parse a RFC 4180 CSV string literal; shorthand for csv::parse()
31 |      *  (spelled without a space after "" because that form is deprecated)
32 |      *
33 |      *  @par Example
34 |      *  @snippet tests/test_read_csv.cpp Escaped Comma
35 |      *
36 |      */
37 |     CSV_INLINE CSVReader operator""_csv(const char* in, size_t n) {
38 |         return parse(csv::string_view(in, n));
39 |     }
40 | 
41 |     /** A shorthand for csv::parse_no_header() (no space after "": that spelling is deprecated) */
42 |     CSV_INLINE CSVReader operator""_csv_no_header(const char* in, size_t n) {
43 |         return parse_no_header(csv::string_view(in, n));
44 |     }
45 | 
46 |     /**
47 |      *  Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise
48 |      *
49 |      *  @param[in] filename  Path to CSV file
50 |      *  @param[in] col_name  Column whose position we should resolve
51 |      *  @param[in] format    Format of the CSV file
52 |      *  @return    The result of CSVReader::index_of() for col_name
53 |      */
54 |     CSV_INLINE int get_col_pos(csv::string_view filename, csv::string_view col_name,
55 |         const CSVFormat& format) {
56 |         // A reader is opened only to answer this one lookup
57 |         CSVReader lookup(filename, format);
58 |         return lookup.index_of(col_name);
59 |     }
60 | 
61 |     /** Collect the column names, delimiter, row count and column count of a CSV file
62 |      *  @include programs/csv_info.cpp
63 |      */
64 |     CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) {
65 |         CSVReader reader(filename);
66 |         CSVFormat detected = reader.get_format();
67 | 
68 |         // Walk every row and discard it; n_rows() is only queried afterwards
69 |         for (auto row = reader.begin(); row != reader.end(); ++row) {}
70 | 
71 |         return CSVFileInfo{
72 |             filename,
73 |             reader.get_col_names(),
74 |             detected.get_delim(),
75 |             reader.n_rows(),
76 |             reader.get_col_names().size()
77 |         };
78 |     }
79 | }


--------------------------------------------------------------------------------
/tests/test_csv_field_array.cpp:
--------------------------------------------------------------------------------
 1 | #include <catch2/catch_all.hpp>
 2 | #include <future>
 3 | 
 4 | #include "csv.hpp"
 5 | 
 6 | using namespace csv;
 7 | using namespace csv::internals;
 8 | 
 9 | TEST_CASE("Test Dynamic RawCSVFieldArray - Emplace Back", "[test_dynamic_array_emplace]") {
10 |     constexpr size_t length_delta = 100;
11 |     constexpr size_t num_fields = 9999;
12 | 
13 |     // The size passed to the constructor (500) is deliberately smaller than
14 |     // the number of entries pushed below
15 |     CSVFieldList fields(500);
16 | 
17 |     for (size_t idx = 0; idx < num_fields; ++idx) {
18 |         fields.emplace_back(idx, idx + length_delta);
19 | 
20 |         // The entry must be readable through operator[] right after insertion
21 |         REQUIRE(fields[idx].start == idx);
22 |         REQUIRE(fields[idx].length == idx + length_delta);
23 | 
24 |         REQUIRE(fields.size() == idx + 1);
25 |     }
26 | 
27 |     for (size_t idx = 0; idx < num_fields; ++idx) {
28 |         // Earlier entries must still hold their values after all later insertions
29 |         REQUIRE(fields[idx].start == idx);
30 |         REQUIRE(fields[idx].length == idx + length_delta);
31 |     }
32 | }
33 | 
34 | TEST_CASE("Test CSVFieldArray Thread Safety", "[test_array_thread]") {
35 |     constexpr size_t offset = 100;
36 |     constexpr size_t initial_count = 9999;
37 |     // Array size should be smaller than the number of items we want to push
38 |     CSVFieldList arr(500);
39 | 
40 |     for (size_t i = 0; i < initial_count; i++) {
41 |         arr.emplace_back(i, i + offset);
42 | 
43 |         // Check operator[] as field was just populated
44 |         REQUIRE(arr[i].start == i);
45 |         REQUIRE(arr[i].length == i + offset);
46 | 
47 |         REQUIRE(arr.size() == i + 1);
48 |     }
49 | 
50 |     // Check the initial contents from other threads. std::launch::async is required:
51 |     // the default policy may defer the task and run it on this thread inside get().
52 |     constexpr size_t num_workers = 4;
53 |     constexpr size_t chunk_size = initial_count / num_workers;
54 |     std::vector<std::future<bool>> workers = {};
55 |     for (size_t i = 0; i < num_workers; i++) {
56 |         const size_t start = i * chunk_size;
57 |         // The last worker also takes the remainder so that every initial entry is verified
58 |         const size_t end = (i + 1 == num_workers) ? initial_count : start + chunk_size;
59 |         workers.push_back(
60 |             std::async(std::launch::async, [](const CSVFieldList& arr, size_t start, size_t end, size_t offset) {
61 |                 for (size_t i = start; i < end; i++) {
62 |                     if (arr[i].start != i || arr[i].length != i + offset)
63 |                         return false;
64 |                 }
65 | 
66 |                 return true;
67 |             }, std::ref(arr), start, end, offset)
68 |         );
69 |     }
70 | 
71 |     // Keep appending from this thread while the workers read concurrently
72 |     for (size_t i = initial_count; i < initial_count + 10000; i++) {
73 |         arr.emplace_back(i, i + offset);
74 | 
75 |         // Check operator[] as field was just populated
76 |         REQUIRE(arr[i].start == i);
77 |         REQUIRE(arr[i].length == i + offset);
78 | 
79 |         REQUIRE(arr.size() == i + 1);
80 |     }
81 | 
82 |     for (auto& result : workers) {
83 |         REQUIRE(result.get() == true);
84 |     }
85 | }
86 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language:
 2 |   - cpp
 3 | matrix:
 4 |   include:
 5 |     - os: linux
 6 |       env: STD=c++11 CSV_CXX_STANDARD=11 CXX_COMPILER=g++-9 C_COMPILER=gcc-9
 7 |       compiler: gcc
 8 |       addons:
 9 |         apt:
10 |           sources: ['ubuntu-toolchain-r-test']
11 |           packages: ['g++-9', 'cmake', 'valgrind', 'doxygen']
12 |     - os: linux
13 |       env: STD=c++14 CSV_CXX_STANDARD=14 CXX_COMPILER=g++-9 C_COMPILER=gcc-9
14 |       compiler: gcc
15 |       addons:
16 |         apt:
17 |           sources: ['ubuntu-toolchain-r-test']
18 |           packages: ['g++-9', 'cmake', 'doxygen']
19 |     - os: linux
20 |       env: STD=c++17 CSV_CXX_STANDARD=17 MAIN_BUILD=true CXX_COMPILER=g++-9 C_COMPILER=gcc-9
21 |       compiler: gcc
22 |       addons:
23 |         apt:
24 |           sources: ['ubuntu-toolchain-r-test']
25 |           packages: ['g++-9', 'cmake', 'valgrind', 'doxygen']
26 |     - os: linux
27 |       dist: focal
28 |       env: CSV_CXX_STANDARD=11 CXX_COMPILER=clang++-11 C_COMPILER=clang-11
29 |       compiler: clang
30 |       addons:
31 |          apt:
32 |           sources:
33 |           - sourceline: 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main'
34 |             key_url: https://apt.llvm.org/llvm-snapshot.gpg.key
35 |           packages:
36 |           - clang-11
37 |     - os: linux
38 |       dist: focal
39 |       env: CSV_CXX_STANDARD=14 CXX_COMPILER=clang++-11 C_COMPILER=clang-11
40 |       compiler: clang
41 |       addons:
42 |          apt:
43 |           sources:
44 |           - sourceline: 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main'
45 |             key_url: https://apt.llvm.org/llvm-snapshot.gpg.key
46 |           packages:
47 |           - clang-11
48 |     - os: linux
49 |       dist: focal
50 |       env: CSV_CXX_STANDARD=17 CXX_COMPILER=clang++-11 C_COMPILER=clang-11
51 |       compiler: clang
52 |       addons:
53 |          apt:
54 |           sources:
55 |           - sourceline: 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main'
56 |             key_url: https://apt.llvm.org/llvm-snapshot.gpg.key
57 |           packages:
58 |           - clang-11
59 | dist: trusty
60 | sudo: required
61 | script:
62 |   - export CSV_TEST_ROOT=$PWD/tests
63 |   - cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=$CXX_COMPILER  -DCMAKE_C_COMPILER=$C_COMPILER -DCSV_CXX_STANDARD=$CSV_CXX_STANDARD
64 |   - make csv_test
65 |   - ./tests/csv_test
66 |   
67 |   # Memory leak check
68 |   - if [ "$MAIN_BUILD" == "true" ]; then
69 |         make csv_stats;
70 |         valgrind --leak-check=full ./programs/csv_stats $PWD/tests/data/real_data/2016_Gaz_place_national.txt;
71 |     fi;
72 | after_success:
73 |   - if [ "$MAIN_BUILD" == "true" ]; then
74 |         doxygen Doxyfile;
75 |         mv csv_coverage ./docs;
76 |     fi;
77 | deploy:
78 |   provider: pages:git
79 |   edge: true
80 |   cleanup: false
81 |   token: $GITHUB_TOKEN
82 |   keep_history: true
83 |   local_dir: docs
84 |   target_branch: gh-pages
85 |   on:
86 |     branch: master
87 | 


--------------------------------------------------------------------------------
/include/internal/csv_format.cpp:
--------------------------------------------------------------------------------
 1 | /** @file
 2 |  *  Defines an object used to store CSV format settings
 3 |  */
 4 | 
 5 | #include <algorithm>
 6 | #include <set>
 7 | 
 8 | #include "csv_format.hpp"
 9 | 
10 | namespace csv {
11 |     CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) {
12 |         possible_delimiters.assign(1, delim);  // exactly one candidate delimiter
13 |         assert_no_char_overlap();              // throws std::runtime_error on overlap with quote/trim characters
14 |         return *this;
15 |     }
16 | 
17 |     CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector<char> & delim) {
18 |         possible_delimiters = delim;   // replace the candidate delimiters with a copy of delim
19 |         assert_no_char_overlap();      // throws std::runtime_error on overlap with quote/trim characters
20 |         return *this;
21 |     }
22 | 
23 |     CSV_INLINE CSVFormat& CSVFormat::quote(char quote) {
24 |         quote_char = quote;            // store the new quote character
25 |         no_quote = false;              // setting a quote character clears the no_quote flag
26 |         assert_no_char_overlap();      // throws std::runtime_error on overlap with delimiter/trim characters
27 |         return *this;
28 |     }
29 | 
30 |     CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector<char> & chars) {
31 |         trim_chars = chars;            // replace the trim characters with a copy of chars
32 |         assert_no_char_overlap();      // throws std::runtime_error on overlap with delimiter/quote characters
33 |         return *this;
34 |     }
35 | 
36 |     CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector<std::string>& names) {
37 |         header = -1;         // explicit names replace any header row index
38 |         col_names = names;   // keep a copy of the caller's names
39 |         return *this;
40 |     }
41 | 
42 |     CSV_INLINE CSVFormat& CSVFormat::header_row(int row) {
43 |         // Any previously supplied column names are dropped when a header row is chosen
44 |         col_names.clear();
45 |         header = row;
46 |         if (row < 0) variable_column_policy = VariableColumnPolicy::KEEP;  // negative row also selects the KEEP policy
47 |         return *this;
48 |     }
49 | 
50 |     CSV_INLINE void CSVFormat::assert_no_char_overlap()
51 |     {
52 |         // std::set yields the sorted, duplicate-free ranges std::set_intersection needs
53 |         const std::set<char> delims(
54 |             this->possible_delimiters.begin(), this->possible_delimiters.end());
55 |         const std::set<char> trims(
56 |             this->trim_chars.begin(), this->trim_chars.end());
57 | 
58 |         // Characters used in more than one role; starts with delimiters that are also trim characters
59 |         std::vector<char> offenders;
60 |         std::set_intersection(
61 |             delims.begin(), delims.end(),
62 |             trims.begin(), trims.end(),
63 |             std::back_inserter(offenders));
64 | 
65 |         // The quote character may be neither a delimiter nor a trim character.
66 |         // It is skipped when already listed so the message never names a character twice.
67 |         const bool quote_clashes = delims.count(this->quote_char) != 0
68 |             || trims.count(this->quote_char) != 0;
69 |         if (quote_clashes && std::find(offenders.begin(), offenders.end(),
70 |                 this->quote_char) == offenders.end()) {
71 |             offenders.push_back(this->quote_char);
72 |         }
73 | 
74 |         if (!offenders.empty()) {
75 |             std::string err_msg = "There should be no overlap between the quote character, "
76 |                 "the set of possible delimiters "
77 |                 "and the set of whitespace characters. Offending characters: ";
78 | 
79 |             // Format the offenders as 'a', 'b', ...
80 |             for (size_t i = 0; i < offenders.size(); i++) {
81 |                 err_msg += "'";
82 |                 err_msg += offenders[i];
83 |                 err_msg += "'";
84 | 
85 |                 if (i + 1 < offenders.size())
86 |                     err_msg += ", ";
87 |             }
88 | 
89 |             throw std::runtime_error(err_msg + '.');
90 |         }
91 |     }
92 | }


--------------------------------------------------------------------------------
/tests/test_csv_stat.cpp:
--------------------------------------------------------------------------------
 1 | #include <cstring>
 2 | #include <catch2/catch_all.hpp>
 3 | #include "csv.hpp"
 4 | using namespace csv;
 5 | 
 6 | const std::string PERSONS_CSV = "./tests/data/mimesis_data/persons.csv";
 7 | 
 8 | // Regression test for #208: parsing an empty file should throw rather than SEGFAULT
 9 | TEST_CASE("Empty File", "[read_csv_stat_empty]") {
10 |     bool threw = false;
11 | 
12 |     try {
13 |         CSVStat stats("./tests/data/fake_data/empty.csv");
14 |         stats.get_mins();
15 |     }
16 |     catch (const std::runtime_error& err) {
17 |         threw = true;
18 |         REQUIRE(std::string(err.what()) == "Cannot open file ./tests/data/fake_data/empty.csv");
19 |     }
20 | 
21 |     REQUIRE(threw);
22 | }
23 | 
24 | TEST_CASE("Calculating Statistics from Direct Input", "[read_csv_stat_direct]" ) {
25 |     // Build 100 CRLF-terminated records "i,i,i" for i = 1..100
26 |     std::stringstream int_list;
27 |     for (int i = 1; i <= 100; ++i) {
28 |         const std::string value = std::to_string(i);
29 |         int_list << value << "," << value << "," << value << "\r\n";
30 |     }
31 | 
32 |     // The stream carries no header line, so the column names are set on the format
33 |     CSVFormat format;
34 |     format.column_names({ "A", "B", "C" });
35 | 
36 |     CSVStat reader(int_list, format);
37 | 
38 |     const std::vector<long double> expected_means(3, 50.5L);
39 |     const std::vector<long double> expected_mins(3, 1.0L);
40 |     const std::vector<long double> expected_maxes(3, 100.0L);
41 | 
42 |     REQUIRE( reader.get_mins() == expected_mins );
43 |     REQUIRE( reader.get_maxes() == expected_maxes );
44 |     REQUIRE( reader.get_mean() == expected_means );
45 |     REQUIRE( ceill(reader.get_variance()[0]) == 842 );
46 | 
47 |     // Every integer from 1 to 100 must have been counted exactly once in column 0
48 |     for (int value = 1; value <= 100; ++value)
49 |         REQUIRE( reader.get_counts()[0][std::to_string(value)] == 1 );
50 | 
51 |     // All 100 values in column 0 should be classified as CSV_INT8
52 |     REQUIRE( reader.get_dtypes()[0][DataType::CSV_INT8] == 100 );
53 | }
54 | 
55 | TEST_CASE( "Statistics - Rows of Integers", "[read_csv_stat]" ) {
56 |     // Header on first row; the same checks run once per fixture file
57 |     auto path = GENERATE(as<std::string> {},
58 |         "./tests/data/fake_data/ints.csv",
59 |         "./tests/data/fake_data/ints_newline_sep.csv"
60 |     );
61 | 
62 |     SECTION("Compute Statistics") {
63 |         CSVStat stats(path);
64 | 
65 |         // Expected results: each of the ten columns
66 |         // should have a mean of 50.5
67 |         const std::vector<long double> expected_means(10, 50.5L);
68 | 
69 |         REQUIRE(stats.get_mean() == expected_means);
70 | 
71 |         // Extrema and variance are spot-checked on the first column only
72 |         REQUIRE(stats.get_mins()[0] == 1);
73 |         REQUIRE(stats.get_maxes()[0] == 100);
74 |         REQUIRE(ceill(stats.get_variance()[0]) == 842);
75 |     }
76 | }
77 | 
78 | TEST_CASE( "Statistics - persons.csv", "[test_stat_person]" ) {
79 |     CSVStat stats(PERSONS_CSV);
80 |     REQUIRE(stats.get_maxes()[0] == 49999);         // maximum of column 0
81 |     REQUIRE( ceill(stats.get_mean()[2]) == 42 );    // mean of column 2, rounded up
82 | }
83 | 
84 | TEST_CASE("Data Types - persons.csv", "[test_dtypes_person]") {
85 |     auto column_types = csv_data_types(PERSONS_CSV);
86 | 
87 |     // Age is the only column here that is not expected to be CSV_STRING
88 |     REQUIRE(column_types["Age"] == DataType::CSV_INT8);
89 |     for (const char* name : { "Full Name", "Occupation", "Email", "Telephone", "Nationality" }) {
90 |         CAPTURE(name);
91 |         REQUIRE(column_types[name] == DataType::CSV_STRING);
92 |     }
93 | }
94 | 


--------------------------------------------------------------------------------
/docs/source/Doxy.md:
--------------------------------------------------------------------------------
 1 | # Vince's CSV Library
 2 | 
 3 | This is the detailed documentation for Vince's CSV library. 
 4 | For quick examples, go to this project's [GitHub page](https://github.com/vincentlaucsb/csv-parser).
 5 | 
 6 | ## Outline
 7 | 
 8 | ### CSV Reading
 9 |  * csv::CSVFormat: \copybrief csv::CSVFormat
10 |  * csv::CSVReader
11 |   * csv::CSVReader::n_rows(): \copybrief csv::CSVReader::n_rows()
12 |   * csv::CSVReader::utf8_bom(): \copybrief csv::CSVReader::utf8_bom()
13 |   * csv::CSVReader::get_format(): \copybrief csv::CSVReader::get_format()
14 |   * Retrieving data
15 |       * csv::CSVReader::iterator: Recommended
16 |         * csv::CSVReader::begin()
17 |         * csv::CSVReader::end()
18 |       * csv::CSVReader::read_row()
19 |  * Convenience Functions
20 |   * csv::parse()
21 |   * csv::operator ""_csv()
22 |   * csv::parse_no_header()
23 |   * csv::operator ""_csv_no_header()
24 | 
25 |  #### See also
26 |  [Dealing with Variable Length CSV Rows](md_docs_source_variable_row_lengths.html)
27 | 
28 |  #### Working with parsed data
29 |  * csv::CSVRow: \copybrief csv::CSVRow
30 |   * csv::CSVRow::operator std::vector<std::string>()
31 |   * csv::CSVRow::iterator
32 |     * csv::CSVRow::begin()
33 |     * csv::CSVRow::end()
34 |   * csv::CSVRow::to_json()
35 |   * csv::CSVRow::to_json_array()
36 |  * csv::CSVField
37 |   * csv::CSVField::get(): \copybrief csv::CSVField::get()
38 |   * csv::CSVField::operator==()
39 | 
40 | ### Statistics
41 |  * csv::CSVStat
42 | 
43 | ### CSV Writing
44 |  * csv::make_csv_writer(): Construct a csv::CSVWriter
45 |  * csv::make_tsv_writer(): Construct a csv::TSVWriter
46 |  * csv::DelimWriter
47 |    * Pre-Defined Specializations
48 |      * csv::CSVWriter
49 |      * csv::TSVWriter
50 |    * Methods
51 |      * csv::DelimWriter::operator<<()
52 | 
53 | ## Frequently Asked Questions
54 | 
55 | ### How does automatic starting row detection work?
56 | See "How does automatic delimiter detection work?"
57 | 
58 | ### How does automatic delimiter detection work?
59 | First, the CSV reader attempts to parse the first 100 lines of a CSV file as if the delimiter were a pipe, tab, comma, etc.
60 | Out of all the possible delimiter choices, the delimiter which produces the highest number of `rows * columns` (where all rows
61 | are of a consistent length) is chosen as the winner.
62 | 
63 | However, if the CSV file has leading comments, or has fewer than 100 lines, a second heuristic will be used. The CSV reader again
64 | parses the first 100 lines using each candidate delimiter, but tallies up the length of each row parsed. Then, the delimiter with
65 | the largest most common row length `n` is chosen as the winner, and the line number where the first row of length `n` occurs
66 | is chosen as the starting row.
67 | 
68 | Because you can subclass csv::CSVReader, you can implement your own guessing heuristic. csv::internals::CSVGuesser may be used as a helpful guide in doing so.
69 | 
70 | ### Is the CSV parser thread-safe?
71 | This library already does a lot of work behind the scenes to use threads to squeeze
72 | performance from your CPU. However, ambitious users who are in the mood for
73 | experimenting should follow these guidelines:
74 |  * csv::CSVReader::iterator should only be used from one thread
75 |    * A workaround is to chunk blocks of `CSVRow` objects together and 
76 |      create separate threads to process each column
77 |  * csv::CSVRow may be safely processed from multiple threads
78 |  * csv::CSVField objects should only be read from one thread at a time
79 |    * **Note**: csv::CSVRow::operator[]() produces separate copies of `csv::CSVField` objects


--------------------------------------------------------------------------------
/.github/workflows/cmake-multi-platform.yml:
--------------------------------------------------------------------------------
 1 | # This starter workflow is for a CMake project running on multiple platforms. There is a different starter workflow if you just want a single platform.
 2 | # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml
 3 | name: CMake on multiple platforms
 4 | 
 5 | on:
 6 |   push:
 7 |     branches: [ "master", "memory-fix-csvfieldlist" ]
 8 |   pull_request:
 9 |     branches: [ "master" ]
10 | 
11 | jobs:
12 |   build:
13 |     runs-on: ${{ matrix.os }}
14 | 
15 |     strategy:
16 |       # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
17 |       fail-fast: false
18 | 
19 |       # Set up a matrix to run the following 4 configurations (each toolchain is built with C++17 and C++20):
20 |       # 1-2. <Windows, Release, latest MSVC compiler toolchain on the default runner image, default generator>
21 |       # 3-4. <Linux, Release, latest GCC compiler toolchain on the default runner image, default generator>
22 |       # The windows/gcc and ubuntu/cl combinations are excluded below; Clang is not part of this matrix.
23 |       #
24 |       # To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list.
25 |       matrix:
26 |         os: [windows-latest, ubuntu-latest]
27 |         build_type: [Release]
28 |         c_compiler: [gcc, cl]
29 |         cxx_standard: [17, 20]
30 |         include:
31 |           - os: windows-latest
32 |             c_compiler: cl
33 |             cpp_compiler: cl
34 |           - os: ubuntu-latest
35 |             c_compiler: gcc
36 |             cpp_compiler: g++
37 |         exclude:
38 |           - os: windows-latest
39 |             c_compiler: gcc
40 |           - os: ubuntu-latest
41 |             c_compiler: cl
42 | 
43 |     steps:
44 |     - name: Checkout repository and submodules
45 |       uses: actions/checkout@v4
46 |       with:
47 |         submodules: recursive
48 | 
49 |     - name: Set reusable strings
50 |       # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
51 |       id: strings
52 |       shell: bash
53 |       run: |
54 |         echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
55 | 
56 |     - name: Configure CMake
57 |       # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
58 |       # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
59 |       run: >
60 |         cmake -B ${{ steps.strings.outputs.build-output-dir }}
61 |         -DCSV_CXX_STANDARD=${{ matrix.cxx_standard }}
62 |         -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
63 |         -DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
64 |         -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
65 |         -S ${{ github.workspace }}
66 | 
67 |     - name: Build
68 |       # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
69 |       run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
70 | 
71 |     - name: Test
72 |       working-directory: ${{ steps.strings.outputs.build-output-dir }}
73 |       # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
74 |       # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
75 |       run: ctest --build-config ${{ matrix.build_type }}
76 | 


--------------------------------------------------------------------------------
/tests/test_csv_row_json.cpp:
--------------------------------------------------------------------------------
  1 | #include <catch2/catch_all.hpp>
  2 | #include "csv.hpp"
  3 | 
  4 | #include <sstream>
  5 | using namespace csv;
  6 | 
  7 | /** Test helper: write col_names and data as two CSV records, then parse the data record back into a CSVRow */
  8 | CSVRow make_csv_row(std::vector<std::string> data, std::vector<std::string> col_names) {
  9 |     using namespace csv::internals;
 10 | 
 11 |     // Serialize the column names first, then the single record of fields
 12 |     std::stringstream csv_text;
 13 |     auto writer = make_csv_writer(csv_text);
 14 |     writer << col_names;
 15 |     writer << data;
 16 |     // Read the text back; the first row handed out by read_row() is returned
 17 |     CSVReader reader(csv_text);
 18 |     CSVRow parsed;
 19 |     reader.read_row(parsed);
 20 | 
 21 |     return parsed;
 22 | }
 23 | 
 24 | TEST_CASE("json_escape_string() Test", "[json_escape_string]") {
 25 |     using internals::json_escape_string;
 26 | 
 27 |     // Assert that special characters are escaped properly
 28 |     REQUIRE(json_escape_string("Quote\"Quote") == "Quote\\\"Quote");
 29 |     REQUIRE(json_escape_string("RSolidus\\RSolidus")
 30 |         == "RSolidus\\\\RSolidus");
 31 |     REQUIRE(json_escape_string("Backspace\bBackspace")
 32 |         == "Backspace\\bBackspace");
 33 |     REQUIRE(json_escape_string("Formfeed\fFormfeed")
 34 |         == "Formfeed\\fFormfeed");
 35 |     REQUIRE(json_escape_string("Newline\nNewline")
 36 |         == "Newline\\nNewline");
 37 |     REQUIRE(json_escape_string("CarriageReturn\rCarriageReturn")
 38 |         == "CarriageReturn\\rCarriageReturn");
 39 |     REQUIRE(json_escape_string("Tab\tTab")
 40 |         == "Tab\\tTab");
 41 |     // Assert that control characters are escaped properly. The length is passed
 42 |     // explicitly because a bare "Null\0Null" literal would be cut off at the NUL.
 43 |     REQUIRE(json_escape_string(csv::string_view("Null\0Null", 9))
 44 |         == "Null\\u0000Null");
 45 | }
 46 | 
 47 | TEST_CASE("CSVRow to_json() Test", "[csv_row_to_json]") {
 48 |     const std::vector<std::string> fields = { "Col 1", "Col 2" };
 49 |     const std::vector<std::string> names = { "A", "B" };
 50 |     CSVRow row = make_csv_row(fields, names);
 51 | 
 52 |     // Text fields are expected as quoted JSON strings keyed by column name
 53 |     REQUIRE(row.to_json() == R"({"A":"Col 1","B":"Col 2"})");
 54 | }
 55 | 
 56 | TEST_CASE("CSVRow to_json() Test with Numbers", "[csv_numeric_row_to_json]") {
 57 |     const std::vector<std::string> fields = { "1234.3", "234" };
 58 |     const std::vector<std::string> names = { "A", "B" };
 59 |     CSVRow row = make_csv_row(fields, names);
 60 | 
 61 |     // Numeric fields are expected without surrounding quotes
 62 |     REQUIRE(row.to_json() == R"({"A":1234.3,"B":234})");
 63 | }
 64 | 
 65 | TEST_CASE("CSVRow to_json() Test - Mixed", "[csv_mixed_row_to_json]") {
 66 |     // Columns A, B and E hold numbers; C and D hold text
 67 |     const std::vector<std::string> fields = { "1234.3", "234", "ABCD", "AB1", "1337" };
 68 |     const std::vector<std::string> names = { "A", "B", "C", "D", "E" };
 69 |     CSVRow row = make_csv_row(fields, names);
 70 | 
 71 |     SECTION("Full Row") {
 72 |         REQUIRE(row.to_json() == R"({"A":1234.3,"B":234,"C":"ABCD","D":"AB1","E":1337})");
 73 |     }
 74 | 
 75 |     SECTION("Subset") {
 76 |         REQUIRE(row.to_json({ "B", "C" }) == R"({"B":234,"C":"ABCD"})");
 77 |         REQUIRE(row.to_json({ "B", "A" }) == R"({"B":234,"A":1234.3})");
 78 |     }
 79 | }
 80 | 
 81 | TEST_CASE("CSVRow to_json_array() Test() - Mixed", "[csv_mixed_row_to_json_array]") {
 82 |     // Columns A, B and E hold numbers; C and D hold text
 83 |     const std::vector<std::string> fields = { "1234.3", "234", "ABCD", "AB1", "1337" };
 84 |     const std::vector<std::string> names = { "A", "B", "C", "D", "E" };
 85 |     CSVRow row = make_csv_row(fields, names);
 86 | 
 87 |     SECTION("Full Row") {
 88 |         REQUIRE(row.to_json_array() == R"([1234.3,234,"ABCD","AB1",1337])");
 89 |     }
 90 | 
 91 |     SECTION("Subset") {
 92 |         REQUIRE(row.to_json_array({ "B", "C" }) == R"([234,"ABCD"])");
 93 |         REQUIRE(row.to_json_array({ "B", "A" }) == R"([234,1234.3])");
 94 |     }
 95 | }
 96 | 
 97 | // Reported in: https://github.com/vincentlaucsb/csv-parser/issues/68
 98 | TEST_CASE("CSVRow to_json() with Wrong Columns", "[csv_json_wrong_cols]") {
 99 |     // Only two column names are supplied for data that has more fields per line
100 |     std::stringstream input("A,B,C,\n123,345,678,");
101 | 
102 |     CSVFormat format{};
103 |     format.column_names({ "A", "B" });
104 | 
105 |     CSVReader reader(input, format);
106 |     CSVRow row;
107 |     reader.read_row(row);
108 | 
109 |     // The supplied column names do not match the data, so no row data is expected.
110 |     // to_json() and to_json_array() must then give empty results instead of segfaulting.
111 |     REQUIRE(row.to_json() == "{}");
112 |     REQUIRE(row.to_json_array() == "[]");
113 | }


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.9)
  2 | project(csv)
  3 | 
  4 | if(CSV_CXX_STANDARD)
  5 | 	set(CMAKE_CXX_STANDARD ${CSV_CXX_STANDARD})
  6 | else()
  7 | 	set(CMAKE_CXX_STANDARD 17)
  8 | endif(CSV_CXX_STANDARD)
  9 | 
 10 | option(BUILD_PYTHON "Build Python Binding" OFF)
 11 | 
 12 | message("Building CSV library using C++${CMAKE_CXX_STANDARD}")
 13 | 
 14 | # Defines CSV_HAS_CXX17 in compatibility.hpp
 15 | if (CMAKE_VERSION VERSION_LESS "3.12.0")
 16 | 	add_definitions(-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD})
 17 | else()
 18 | 	add_compile_definitions(CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD})
 19 | endif()
 20 | 
 21 | set(THREADS_PREFER_PTHREAD_FLAG TRUE)
 22 | find_package(Threads QUIET REQUIRED)
 23 | 
 24 | if(MSVC)
 25 | 	# Make Visual Studio report accurate C++ version
 26 | 	# See: https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
 27 |   # /Wall emits warnings about the C++ standard library
 28 | 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /GS- /Zc:__cplusplus /W4")
 29 | else()
 30 | 	# Ignore Visual Studio pragma regions
 31 | 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
 32 |   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} --coverage -Og")
 33 | endif(MSVC)
 34 | 
 35 | set(CSV_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR})
 36 | set(CSV_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
 37 | set(CSV_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/)
 38 | set(CSV_SOURCE_DIR ${CSV_INCLUDE_DIR}/internal/)
 39 | set(CSV_TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests)
 40 | 
 41 | include_directories(${CSV_INCLUDE_DIR})
 42 | 
 43 | ## Load developer specific CMake settings
 44 | if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
 45 |     SET(CSV_DEVELOPER TRUE)
 46 | endif()
 47 | 
 48 | ## Main Library
 49 | add_subdirectory(${CSV_SOURCE_DIR})
 50 | 
 51 | # build the python binding for the library 
 52 | if (${BUILD_PYTHON})
 53 |     message("Building Python bindings for the library.")
 54 |     add_subdirectory(python)
 55 | endif()
 56 | 
 57 | ## Executables
 58 | option(CSV_BUILD_PROGRAMS "Allow to disable building of programs" ON)
 59 | if (CSV_BUILD_PROGRAMS)
 60 |     add_subdirectory("programs")
 61 | endif()
 62 | 
 63 | ## Developer settings
 64 | if (CSV_DEVELOPER)
 65 |     # Allow for performance profiling
 66 |     if (MSVC)
 67 | 	    target_link_options(csv PUBLIC /PROFILE)
 68 |     endif()
 69 |      
 70 |     # More error messages.
 71 |     if (UNIX)
 72 |       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
 73 |         -Wall -Wextra -Wsign-compare \
 74 |         -Wwrite-strings -Wpointer-arith -Winit-self \
 75 |         -Wconversion -Wno-sign-conversion")
 76 |     endif()
 77 | 
 78 |     # Generate a single header library
 79 |     if(CMAKE_VERSION VERSION_LESS "3.12")
 80 |       find_package(PythonInterp 3 QUIET)
 81 |       set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE})  # FindPythonInterp reports the interpreter as PYTHON_EXECUTABLE
 82 |     else()
 83 |       find_package(Python3 COMPONENTS Interpreter)
 84 |     endif()
 85 |     if(Python3_Interpreter_FOUND OR PYTHONINTERP_FOUND)
 86 |       add_custom_target(generate_single_header
 87 |           COMMAND ${Python3_EXECUTABLE} single_header.py > single_include/csv.hpp
 88 |           COMMAND ${Python3_EXECUTABLE} single_header.py > single_include_test/csv.hpp
 89 |           WORKING_DIRECTORY ${CSV_ROOT_DIR}
 90 |       )
 91 |       add_subdirectory(single_include_test)  # Single header compilation test
 92 |     else()
 93 |       message(WARNING "Python3 not found, skipping target 'generate_single_header'.")
 94 |     endif()
 95 | 
 96 |     # Documentation
 97 |     find_package(Doxygen QUIET)
 98 |     if(DOXYGEN_FOUND)
 99 |       add_custom_target(doxygen
100 |           COMMAND ${DOXYGEN_EXECUTABLE} ${CSV_ROOT_DIR}/Doxyfile
101 |           WORKING_DIRECTORY ${CSV_ROOT_DIR}
102 |       )
103 |     else()
104 |       message(WARNING "Doxygen not found, skipping target 'doxygen'.")
105 |     endif()
106 | 
107 |     ## Tests
108 |     enable_testing()
109 |     add_subdirectory("tests")
110 | 
111 |     # Code coverage
112 |     #find_program( GCOV_PATH gcov )
113 |     #if(GCOV_PATH)
114 |     #    set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules")
115 |     #    include(CodeCoverage)
116 |     #    append_coverage_compiler_flags()
117 |     #    set(ENV{CSV_TEST_ROOT} ${CSV_TEST_DIR})
118 |     #    setup_target_for_coverage_gcovr_html(
119 |     #      NAME csv_coverage
120 |     #      EXECUTABLE csv_test
121 |     #      EXCLUDE "tests/*"
122 |     #    )
123 |     #endif()
124 | endif()
125 | 


--------------------------------------------------------------------------------
/tests/test_csv_iterator.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | // Tests for the CSVRow Iterators and CSVReader Iterators
  3 | //
  4 | 
  5 | #include <catch2/catch_all.hpp>
  6 | #include "csv.hpp"
  7 | using namespace csv;
  8 | 
  9 | //////////////////////
 10 | // CSVRow Iterators //
 11 | //////////////////////
 12 | 
 13 | TEST_CASE("Test CSVRow Interator", "[test_csv_row_iter]") { // Exercises CSVRow field iterators on the first data row
 14 |     auto rows = "A,B,C\r\n" // Header row
 15 |         "123,234,345\r\n"
 16 |         "1,2,3\r\n"
 17 |         "1,2,3"_csv;
 18 | 
 19 |     CSVRow row;
 20 |     rows.read_row(row); // the assertions below expect this to be the row 123,234,345
 21 | 
 22 |     SECTION("Forwards and Backwards Iterators") {
 23 |         // Forwards: begin() is the first field, end() - 1 is the last field
 24 |         REQUIRE(row.begin()->get<int>() == 123);
 25 |         REQUIRE((row.end() - 1)->get<>() == "345");
 26 | 
 27 |         size_t i = 0; // position of the field currently being visited
 28 |         for (auto it = row.begin(); it != row.end(); ++it) {
 29 |             if (i == 0) REQUIRE(it->get<>() == "123");
 30 |             else if (i == 1) REQUIRE(it->get<>() == "234");
 31 |             else  REQUIRE(it->get<>() == "345");
 32 | 
 33 |             i++;
 34 |         }
 35 | 
 36 |         // Backwards: rbegin() is the last field, rend() - 1 is the first field
 37 |         REQUIRE(row.rbegin()->get<int>() == 345);
 38 |         REQUIRE((row.rend() - 1)->get<>() == "123");
 39 |     }
 40 | 
 41 |     SECTION("Iterator Arithmetic") {
 42 |         REQUIRE(row.begin()->get<int>() == 123);
 43 |         REQUIRE((row.end() - 1)->get<>() == "345");
 44 | 
 45 |         auto row_start = row.begin();
 46 |         REQUIRE(*(row_start + 1) == "234"); // operator+ must not modify row_start itself
 47 |         REQUIRE(*(row_start + 2) == "345");
 48 | 
 49 |     }
 50 | 
 51 |     SECTION("Post-Increment Iterator") {
 52 |         auto it = row.begin();
 53 | 
 54 |         REQUIRE(it++->get<int>() == 123); // post-increment yields the old position
 55 |         REQUIRE(it->get<int>() == 234);
 56 | 
 57 |         REQUIRE(it--->get<int>() == 234); // tokenizes as (it--)->get<int>(): old position, then step back
 58 |         REQUIRE(it->get<int>() == 123);
 59 |     }
 60 | 
 61 |     SECTION("Range Based For") {
 62 |         size_t i = 0; // position of the field currently being visited
 63 |         for (auto& field : row) {
 64 |             if (i == 0) REQUIRE(field.get<>() == "123");
 65 |             else if (i == 1) REQUIRE(field.get<>() == "234");
 66 |             else  REQUIRE(field.get<>() == "345");
 67 | 
 68 |             i++;
 69 |         }
 70 |     }
 71 | }
 72 | 
 73 | /////////////////////////
 74 | // CSVReader Iterators //
 75 | /////////////////////////
 76 | 
 77 | //! [CSVReader Iterator 1]
 78 | TEST_CASE("Basic CSVReader Iterator Test", "[read_ints_iter]") { // Iterates a CSVReader three different ways
 79 |     // A file with 100 rows and columns A, B, ... J
 80 |     // where every value in the ith row is the number i
 81 |     CSVReader reader("./tests/data/fake_data/ints.csv");
 82 |     std::vector<std::string> col_names = {
 83 |         "A", "B", "C", "D", "E", "F", "G", "H", "I", "J"
 84 |     };
 85 |     int i = 1; // expected value of every field in the row currently being visited
 86 | 
 87 |     SECTION("Basic Iterator") {
 88 |         for (auto it = reader.begin(); it != reader.end(); ++it) {
 89 |             REQUIRE((*it)[0].get<int>() == i); // first column, looked up by position
 90 |             i++;
 91 |         }
 92 |     }
 93 | 
 94 |     SECTION("Iterator Post-Increment") {
 95 |         auto it = reader.begin();
 96 |         REQUIRE((it++)->operator[]("A").get<int>() == 1); // post-increment yields the first row
 97 |         REQUIRE(it->operator[]("A").get<int>() == 2);
 98 |     }
 99 | 
100 |     SECTION("Range-Based For Loop") {
101 |         for (auto& row : reader) {
102 |             for (auto& j : col_names) REQUIRE(row[j].get<int>() == i); // every column, looked up by name
103 |             i++;
104 |         }
105 |     }
106 | }
107 | //! [CSVReader Iterator 1]
108 | 
109 | //! [CSVReader Iterator 2]
110 | TEST_CASE("CSVReader Iterator + std::max_elem", "[iter_max_elem]") {
111 |     // The first file is such that each value in the ith row is the number i
112 |     // (there are 100 rows)
113 |     // The second file is a database of California state employee salaries
114 |     CSVReader r1("./tests/data/fake_data/ints.csv"),
115 |         r2("./tests/data/real_data/2015_StateDepartment.csv");
116 | 
117 |     // Find largest number (orders rows by the integer in column "A")
118 |     auto int_finder = [](CSVRow& left, CSVRow& right) {
119 |         return (left["A"].get<int>() < right["A"].get<int>());
120 |     };
121 | 
122 |     auto max_int = std::max_element(r1.begin(), r1.end(), int_finder); // both iterators must come from r1
123 | 
124 |     // Find highest salary (orders rows by the "Total Wages" column)
125 |     auto wage_finder = [](CSVRow& left, CSVRow& right) {
126 |         return (left["Total Wages"].get<double>() < right["Total Wages"].get<double>());
127 |     };
128 | 
129 |     auto max_wage = std::max_element(r2.begin(), r2.end(), wage_finder);
130 | 
131 |     REQUIRE((*max_int)["A"] == 100);
132 |     REQUIRE((*max_wage)["Total Wages"] == "812064.87");
133 | }
134 | //! [CSVReader Iterator 2]
135 | 


--------------------------------------------------------------------------------
/tests/test_write_csv.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h> // For remove()
  2 | #include <sstream>
  3 | #include <queue>
  4 | #include <list>
  5 | #include <catch2/catch_all.hpp>
  6 | #include "csv.hpp"
  7 | 
  8 | using namespace csv;
  9 | using std::queue;
 10 | using std::vector;
 11 | using std::string;
 12 | 
 13 | #ifndef __clang__
 14 | TEST_CASE("Numeric Converter Tsts", "[test_convert_number]") { // Checks csv::internals::num_digits() and to_string() on doubles
 15 |     SECTION("num_digits") {
 16 |         REQUIRE(csv::internals::num_digits(99.0) == 2);
 17 |         REQUIRE(csv::internals::num_digits(100.0) == 3);
 18 |     }
 19 | 
 20 |     SECTION("Large Numbers") {
 21 |         // Large numbers: integer larger than uint64 capacity
 22 |         REQUIRE(csv::internals::to_string(200000000000000000000.0) == "200000000000000000000.0");
 23 |         REQUIRE(csv::internals::to_string(310000000000000000000.0) == "310000000000000000000.0");
 24 |     }
 25 | 
 26 |     SECTION("Custom Precision") {
 27 |         // Before any change, five decimal places are expected in the output
 28 |         REQUIRE(csv::internals::to_string(1.234) == "1.23400");
 29 |         REQUIRE(csv::internals::to_string(20.0045) == "20.00450");
 30 | 
 31 |         set_decimal_places(2);
 32 |         REQUIRE(csv::internals::to_string(1.234) == "1.23");
 33 | 
 34 |         // Reset to five places so later tests see the precision asserted above
 35 |         set_decimal_places(5);
 36 |     }
 37 | 
 38 |     SECTION("Decimal Numbers x where -1 < x < 0") {
 39 |         REQUIRE(csv::internals::to_string(-0.25) == "-0.25000");
 40 |         REQUIRE(csv::internals::to_string(-0.625) == "-0.62500");
 41 |         REQUIRE(csv::internals::to_string(-0.666) == "-0.66600");
 42 |     }
 43 | 
 44 |     SECTION("Numbers Close to 10^n - Regression") {
 45 |         REQUIRE(csv::internals::to_string(10.0) == "10.0"); // whole numbers are expected with a single ".0"
 46 |         REQUIRE(csv::internals::to_string(100.0) == "100.0");
 47 |         REQUIRE(csv::internals::to_string(1000.0) == "1000.0");
 48 |         REQUIRE(csv::internals::to_string(10000.0) == "10000.0");
 49 |         REQUIRE(csv::internals::to_string(100000.0) == "100000.0");
 50 |         REQUIRE(csv::internals::to_string(1000000.0) == "1000000.0");
 51 |     }
 52 | }
 53 | #endif
 54 | 
 55 | TEST_CASE("Basic CSV Writing Cases", "[test_csv_write]") { // Each section writes one single-field row and sets its expected text
 56 |     std::stringstream output, correct;
 57 |     auto writer = make_csv_writer(output);
 58 | 
 59 |     SECTION("Escaped Comma") {
 60 |         writer << std::array<std::string, 1>({ "Furthermore, this should be quoted." });
 61 |         correct << "\"Furthermore, this should be quoted.\"";
 62 |     }
 63 | 
 64 |     SECTION("Quote Escape") {
 65 |         writer << std::array<std::string, 1>({ "\"What does it mean to be RFC 4180 compliant?\" she asked." });
 66 |         correct << "\"\"\"What does it mean to be RFC 4180 compliant?\"\" she asked.\""; // embedded quotes doubled, field wrapped in quotes
 67 |     }
 68 | 
 69 |     SECTION("Newline Escape") {
 70 |         writer << std::array<std::string, 1>({ "Line 1\nLine2" });
 71 |         correct << "\"Line 1\nLine2\"";
 72 |     }
 73 | 
 74 |     SECTION("Leading and Trailing Quote Escape") {
 75 |         writer << std::array<std::string, 1>({ "\"\"" });
 76 |         correct << "\"\"\"\"\"\""; // two quotes in, six quotes out
 77 |     }
 78 | 
 79 |     SECTION("Quote Minimal") {
 80 |         writer << std::array<std::string, 1>({ "This should not be quoted" });
 81 |         correct << "This should not be quoted";
 82 |     }
 83 | 
 84 |     correct << std::endl; // shared tail: every section expects the row to end with a newline
 85 |     REQUIRE(output.str() == correct.str());
 86 | }
 87 | 
 88 | TEST_CASE("CSV Quote All", "[test_csv_quote_all]") { // A field with nothing to escape must still be quoted here
 89 |     std::stringstream output, correct;
 90 |     auto writer = make_csv_writer(output, false); // presumably the second argument disables minimal quoting — confirm in csv.hpp
 91 | 
 92 |     writer << std::array<std::string, 1>({ "This should be quoted" });
 93 |     correct << "\"This should be quoted\"" << std::endl;
 94 | 
 95 |     REQUIRE(output.str() == correct.str());
 96 | }
 97 | 
 98 | //! [CSV Writer Example]
 99 | TEMPLATE_TEST_CASE("CSV/TSV Writer - operator <<", "[test_csv_operator<<]",
100 |     std::vector<std::string>, std::deque<std::string>, std::list<std::string>) { // run once per container type (TestType)
101 |     std::stringstream output, correct_comma, correct_tab;
102 | 
103 |     // Build correct strings: "1,1" is quoted in the comma output but not in the tab output
104 |     correct_comma << "A,B,C" << std::endl << "\"1,1\",2,3" << std::endl;
105 |     correct_tab << "A\tB\tC" << std::endl << "1,1\t2\t3" << std::endl;
106 | 
107 |     // Test input: the same two rows, stored in the container type under test
108 |     auto test_row_1 = TestType({ "A", "B", "C" }),
109 |         test_row_2 = TestType({ "1,1", "2", "3" });
110 | 
111 |     SECTION("CSV Writer") {
112 |         auto csv_writer = make_csv_writer(output);
113 |         csv_writer << test_row_1 << test_row_2;
114 | 
115 |         REQUIRE(output.str() == correct_comma.str());
116 |     }
117 | 
118 |     SECTION("TSV Writer") {
119 |         auto tsv_writer = make_tsv_writer(output);
120 |         tsv_writer << test_row_1 << test_row_2;
121 | 
122 |         REQUIRE(output.str() == correct_tab.str());
123 |     }
124 | }
125 | //! [CSV Writer Example]
126 | 
127 | //! [CSV Writer Tuple Example]
128 | struct Time {
129 |     std::string hour;    // hour component, emitted verbatim
130 |     std::string minute;  // minute component, emitted verbatim
131 | 
132 |     /** Renders the time as "<hour>:<minute>" so it can be written as a single field. */
133 |     operator std::string() const {
134 |         std::string formatted;
135 |         formatted.reserve(hour.size() + minute.size() + 1);
136 |         formatted.append(hour).append(1, ':').append(minute);
137 |         return formatted;
138 |     }
139 | };
140 | 
141 | #ifndef __clang__
142 | TEST_CASE("CSV Tuple", "[test_csv_tuple]") { // Writes std::tuple rows of mixed types and checks the text produced
143 |     #ifdef CSV_HAS_CXX17
144 |     Time time = { "5", "30" }; // user-defined type, convertible to std::string as "5:30"
145 |     #else
146 |     std::string time = "5:30";
147 |     #endif
148 |     std::stringstream output, correct_output;
149 |     auto csv_writer = make_csv_writer(output);
150 | 
151 |     csv_writer << std::make_tuple("One", 2, "Three", 4.0, time)
152 |         << std::make_tuple("One", (short)2, "Three", 4.0f, time) // same row again with short / float
153 |         << std::make_tuple(-1, -2.0)
154 |         << std::make_tuple(20.2, -20.3, -20.123)
155 |         << std::make_tuple(0.0, 0.0f, 0);
156 | 
157 |     correct_output << "One,2,Three,4.0,5:30" << std::endl
158 |         << "One,2,Three,4.0,5:30" << std::endl
159 |         << "-1,-2.0" << std::endl
160 |         << "20.19999,-20.30000,-20.12300" << std::endl // note: 20.2 is expected as 20.19999, not 20.20000
161 |         << "0.0,0.0,0" << std::endl;
162 | 
163 |     REQUIRE(output.str() == correct_output.str());
164 | }
165 | #endif
166 | //! [CSV Writer Tuple Example]
167 | 


--------------------------------------------------------------------------------
/include/internal/csv_format.hpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  Defines an object used to store CSV format settings
  3 |  */
  4 | 
  5 | #pragma once
  6 | #include <iterator>
  7 | #include <stdexcept>
  8 | #include <string>
  9 | #include <vector>
 10 | 
 11 | #include "common.hpp"
 12 | 
 13 | namespace csv {
 14 |     namespace internals {
 15 |         class IBasicCSVParser;
 16 |     }
 17 | 
 18 |     class CSVReader;
 19 | 
 20 |     /** Determines how to handle rows that are shorter or longer than the majority */
 21 |     enum class VariableColumnPolicy {
 22 |         THROW = -1,     // presumably: such rows raise an error — confirm against the parser
 23 |         IGNORE_ROW = 0, // presumably: such rows are dropped; this is CSVFormat's default policy
 24 |         KEEP   = 1      // presumably: such rows are kept as-is — confirm against the parser
 25 |     };
 26 | 
 27 |     /** Stores the inferred format of a CSV file. */
 28 |     struct CSVGuessResult {
 29 |         char delim;      // inferred delimiter character
 30 |         int header_row;  // inferred header row (presumably a zero-based row index — TODO confirm)
 31 |     };
 32 | 
 33 |     /** Stores information about how to parse a CSV file.
 34 |      *  Can be used to construct a csv::CSVReader. 
 35 |      */
 36 |     class CSVFormat {
 37 |     public:
 38 |         /** Settings for parsing a RFC 4180 CSV file */
 39 |         CSVFormat() = default;
 40 | 
 41 |         /** Sets the delimiter of the CSV file
 42 |          *
 43 |          *  @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap
 44 |          */
 45 |         CSVFormat& delimiter(char delim);
 46 | 
 47 |         /** Sets a list of potential delimiters
 48 |          *
 49 |          *  @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap
 50 |          *  @param[in] delim An array of possible delimiters to try parsing the CSV with
 51 |          */
 52 |         CSVFormat& delimiter(const std::vector<char> & delim);
 53 | 
 54 |         /** Sets the whitespace characters to be trimmed
 55 |          *
 56 |          *  @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap
 57 |          *  @param[in] ws An array of whitespace characters that should be trimmed
 58 |          */
 59 |         CSVFormat& trim(const std::vector<char> & ws);
 60 | 
 61 |         /** Sets the quote character
 62 |          *
 63 |          *  @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap
 64 |          */
 65 |         CSVFormat& quote(char quote);
 66 | 
 67 |         /** Sets the column names.
 68 |          *
 69 |          *  @note Unsets any values set by header_row()
 70 |          */
 71 |         CSVFormat& column_names(const std::vector<std::string>& names);
 72 | 
 73 |         /** Sets the header row
 74 |          *
 75 |          *  @note Unsets any values set by column_names()
 76 |          */
 77 |         CSVFormat& header_row(int row);
 78 | 
 79 |         /** Tells the parser that this CSV has no header row
 80 |          *
 81 |          *  @note Equivalent to `header_row(-1)`
 82 |          *
 83 |          */
 84 |         CSVFormat& no_header() {
 85 |             this->header_row(-1);
 86 |             return *this;
 87 |         }
 88 | 
 89 |         /** Turn quoting on or off (stored internally as the inverted `no_quote` flag) */
 90 |         CSVFormat& quote(bool use_quote) {
 91 |             this->no_quote = !use_quote;
 92 |             return *this;
 93 |         }
 94 | 
 95 |         /** Tells the parser how to handle columns of a different length than the others */
 96 |         CONSTEXPR_14 CSVFormat& variable_columns(VariableColumnPolicy policy = VariableColumnPolicy::IGNORE_ROW) {
 97 |             this->variable_column_policy = policy;
 98 |             return *this;
 99 |         }
100 | 
101 |         /** Boolean shorthand: `true` selects VariableColumnPolicy::KEEP, `false` selects IGNORE_ROW */
102 |         CONSTEXPR_14 CSVFormat& variable_columns(bool policy) {
103 |             this->variable_column_policy = static_cast<VariableColumnPolicy>(policy);
104 |             return *this;
105 |         }
106 | 
107 |         #ifndef DOXYGEN_SHOULD_SKIP_THIS
108 |         char get_delim() const {
109 |             // This error should never be received by end users.
110 |             if (this->possible_delimiters.size() > 1) {
111 |                 throw std::runtime_error("There is more than one possible delimiter.");
112 |             }
113 | 
114 |             return this->possible_delimiters.at(0);
115 |         }
116 | 
117 |         CONSTEXPR bool is_quoting_enabled() const { return !this->no_quote; }
118 |         CONSTEXPR char get_quote_char() const { return this->quote_char; }
119 |         CONSTEXPR int get_header() const { return this->header; }
120 |         std::vector<char> get_possible_delims() const { return this->possible_delimiters; }
121 |         std::vector<char> get_trim_chars() const { return this->trim_chars; }
122 |         CONSTEXPR VariableColumnPolicy get_variable_column_policy() const { return this->variable_column_policy; }
123 |         #endif
124 | 
125 |         /** CSVFormat for guessing the delimiter */
126 |         CSV_INLINE static CSVFormat guess_csv() {
127 |             CSVFormat format;
128 |             format.delimiter({ ',', '|', '\t', ';', '^' })
129 |                 .quote('"')
130 |                 .header_row(0);
131 | 
132 |             return format;
133 |         }
134 |         /** Returns true when more than one candidate delimiter is set, i.e. the delimiter is still ambiguous */
135 |         bool guess_delim() const {
136 |             return this->possible_delimiters.size() > 1;
137 |         }
138 | 
139 |         friend CSVReader;
140 |         friend internals::IBasicCSVParser;
141 | 
142 |     private:
143 |         /** Throws an error if delimiters and trim characters overlap */
144 |         void assert_no_char_overlap();
145 | 
146 |         /** Set of possible delimiters */
147 |         std::vector<char> possible_delimiters = { ',' };
148 | 
149 |         /** Set of whitespace characters to trim */
150 |         std::vector<char> trim_chars = {};
151 | 
152 |         /** Row number with columns (ignored if col_names is non-empty) */
153 |         int header = 0;
154 | 
155 |         /** Whether or not quoting is disabled (inverse of is_quoting_enabled()) */
156 |         bool no_quote = false;
157 | 
158 |         /** Quote character */
159 |         char quote_char = '"';
160 | 
161 |         /** Should be left empty unless file doesn't include header */
162 |         std::vector<std::string> col_names = {};
163 | 
164 |         /** Allow variable length columns? */
165 |         VariableColumnPolicy variable_column_policy = VariableColumnPolicy::IGNORE_ROW;
166 |     };
167 | }


--------------------------------------------------------------------------------
/tests/test_data_type.cpp:
--------------------------------------------------------------------------------
  1 | #include <catch2/catch_all.hpp>
  2 | #include "csv.hpp"
  3 | #include <string>
  4 | 
  5 | #include "./shared/float_test_cases.hpp"
  6 | 
  7 | using namespace csv;
  8 | using namespace csv::internals;
  9 | 
 10 | TEST_CASE( "Recognize Integers Properly", "[dtype_int]" ) { // data_type() must classify integer strings, including space-padded ones
 11 |     std::string a("1"), b(" 2018   "), c(" -69 ");
 12 |     long double out = 0; // receives the parsed numeric value
 13 | 
 14 |     REQUIRE(data_type(a, &out) == DataType::CSV_INT8);
 15 |     REQUIRE(out == 1);
 16 | 
 17 |     REQUIRE(data_type(b, &out) == DataType::CSV_INT16); // surrounding spaces must not change the result
 18 |     REQUIRE(out == 2018);
 19 | 
 20 |     REQUIRE(data_type(c, &out) == DataType::CSV_INT8); // negative value
 21 |     REQUIRE(out == -69);
 22 | }
 23 | 
 24 | TEST_CASE( "Recognize Strings Properly", "[dtype_str]" ) { // number-like text (phone numbers, dotted groups) must stay a string
 25 |     auto str = GENERATE(as<std::string> {}, "test", "999.999.9999", "510-123-4567", "510 123", "510 123 4567");
 26 | 
 27 |     SECTION("String Recognition") {
 28 |         REQUIRE(data_type(str) == DataType::CSV_STRING); // checked once per generated value
 29 |     }
 30 | }
 31 | 
 32 | TEST_CASE( "Recognize Null Properly", "[dtype_null]" ) { // the empty string must be classified as CSV_NULL
 33 |     std::string null_str("");
 34 |     REQUIRE( data_type(null_str) == DataType::CSV_NULL );
 35 | }
 36 | 
 37 | TEST_CASE( "Recognize Floats Properly", "[dtype_float]" ) { // table-driven check over the shared float test cases
 38 |     using std::make_tuple; // NOTE(review): appears unused within this test case
 39 | 
 40 |     SECTION("Parse One Float") {
 41 |         std::string input;
 42 |         long double out = 0; // value reported by data_type()
 43 |         long double expected = 0; // value paired with the input in the table
 44 | 
 45 |         std::tie(input, expected) =
 46 |             GENERATE(table<std::string, long double>(
 47 |                 csv_test::FLOAT_TEST_CASES));
 48 | 
 49 |         REQUIRE(data_type(input, &out) == DataType::CSV_DOUBLE);
 50 |         REQUIRE(is_equal(out, expected));
 51 |     }
 52 | }
 53 | 
 54 | TEST_CASE("Integer Size Recognition", "[int_sizes]") { // data_type() must report the width class matching the value
 55 |     std::string s;
 56 |     long double out = 0; // receives the parsed numeric value
 57 | 
 58 |     SECTION("Boundary Values") {
 59 |         s = std::to_string((long long)csv::internals::CSV_INT8_MAX);
 60 |         REQUIRE(data_type(s, &out) == DataType::CSV_INT8);
 61 |         REQUIRE(out == (long long)CSV_INT8_MAX);
 62 | 
 63 |         s = std::to_string((long long)csv::internals::CSV_INT16_MAX);
 64 |         REQUIRE(data_type(s, &out) == DataType::CSV_INT16);
 65 |         REQUIRE(out == (long long)CSV_INT16_MAX);
 66 | 
 67 |         s = std::to_string((long long)csv::internals::CSV_INT32_MAX);
 68 |         REQUIRE(data_type(s, &out) == DataType::CSV_INT32);
 69 |         REQUIRE(out == (long long)CSV_INT32_MAX);
 70 | 
 71 |         // Note: data_type() doesn't have enough precision for CSV_INT64
 72 |     }
 73 | 
 74 |     SECTION("Integer Overflow") {
 75 |         s = std::to_string((long long)csv::internals::CSV_INT16_MAX + 1); // one past the 16-bit maximum
 76 |         REQUIRE(data_type(s, &out) == DataType::CSV_INT32);
 77 |         REQUIRE(out == (long long)CSV_INT16_MAX + 1);
 78 | 
 79 |         s = std::to_string((long long)csv::internals::CSV_INT32_MAX + 1); // one past the 32-bit maximum
 80 |         REQUIRE(data_type(s, &out) == DataType::CSV_INT64);
 81 |         REQUIRE(out == (long long)CSV_INT32_MAX + 1);
 82 | 
 83 |         // Case: Integer too large to fit in int64 --> store in long double
 84 |         s = std::to_string((long long)csv::internals::CSV_INT64_MAX);
 85 |         s.append("1"); // appending a digit makes the text exceed the int64 maximum
 86 |         REQUIRE(data_type(s, &out) == DataType::CSV_BIGINT);
 87 |     }
 88 | }
 89 | 
 90 | TEST_CASE( "Recognize Sub-Unit Double Values", "[regression_double]" ) { // regression check for a value between 0 and 1
 91 |     std::string s("0.15");
 92 |     long double out = 0; // receives the parsed numeric value
 93 |     REQUIRE(data_type(s, &out) == DataType::CSV_DOUBLE);
 94 |     REQUIRE(is_equal(out, 0.15L));
 95 | }
 96 | 
 97 | TEST_CASE( "Recognize Double Values", "[regression_double2]" ) {
 98 |     // Round-trip doubles: format with std::to_string(), parse back with data_type()
 99 |     long double out = -1.0; // starts at a value the loop never produces
100 |     std::string s;
101 | 
102 |     for (long double i = 0; i <= 2.0; i += 0.01) { // from 0 up to 2.0 in steps of 0.01
103 |         s = std::to_string(i);
104 |         REQUIRE(data_type(s, &out) == DataType::CSV_DOUBLE);
105 |         REQUIRE(is_equal(out, i));
106 |     }
107 | }
108 | 
109 | //! [Parse Scientific Notation]
110 | TEST_CASE("Parse Scientific Notation", "[e_notation]") {
111 |     // E-notation strings must be classified as CSV_DOUBLE and yield the expanded value
112 |     long double out = 0;
113 | 
114 |     REQUIRE(data_type("1E-06", &out) == DataType::CSV_DOUBLE);
115 |     REQUIRE(is_equal(out, 0.000001L));
116 | 
117 |     REQUIRE(data_type("1e-06", &out) == DataType::CSV_DOUBLE); // lowercase 'e'
118 |     REQUIRE(is_equal(out, 0.000001L));
119 | 
120 |     REQUIRE(data_type("2.17222E+02", &out) == DataType::CSV_DOUBLE);
121 |     REQUIRE(is_equal(out, 217.222L));
122 | 
123 |     REQUIRE(data_type("4.55E+10", &out) == DataType::CSV_DOUBLE);
124 |     REQUIRE(is_equal(out, 45500000000.0L));
125 | 
126 |     REQUIRE(data_type("4.55E+11", &out) == DataType::CSV_DOUBLE);
127 |     REQUIRE(is_equal(out, 455000000000.0L));
128 | 
129 |     REQUIRE(data_type("4.55E-1", &out) == DataType::CSV_DOUBLE);
130 |     REQUIRE(is_equal(out, 0.455L));
131 | 
132 |     REQUIRE(data_type("4.55E-5", &out) == DataType::CSV_DOUBLE);
133 |     REQUIRE(is_equal(out, 0.0000455L));
134 | 
135 |     REQUIRE(data_type("4.55E-000000000005", &out) == DataType::CSV_DOUBLE); // zero-padded exponent
136 |     REQUIRE(is_equal(out, 0.0000455L));
137 | }
138 | //! [Parse Scientific Notation]
139 | 
140 | //! [Scientific Notation Flavors]
141 | TEST_CASE("Parse Different Flavors of Scientific Notation", "[sci_notation_diversity]") { // every spelling below must parse to 455000
142 |     auto number = GENERATE(as<std::string> {},
143 |         "4.55e5", "4.55E5",
144 |         "4.55E+5", "4.55e+5",
145 |         "4.55E+05",
146 |         "4.55e0000005", "4.55E0000005",
147 |         "4.55e+0000005", "4.55E+0000005");
148 | 
149 |     SECTION("Recognize 455 thousand") {
150 |         long double out = 0; // receives the parsed numeric value
151 |         REQUIRE(data_type(number, &out) == DataType::CSV_DOUBLE);
152 |         REQUIRE(is_equal(out, 455000.0L));
153 |     }
154 | }
155 | //! [Scientific Notation Flavors]
156 | 
157 | TEST_CASE("Parse Scientific Notation Malformed", "[sci_notation]") {
158 |     // Malformed scientific notation must be classified as a plain string,
159 |     // without a crash or any other weird side effects
160 |     auto butchered = GENERATE(as<std::string>{},
161 |         "4.55E000a",
162 |         "4.55000x40",
163 |         "4.55000E40E40");
164 | 
165 |     SECTION("Butchered Parsing Attempt") {
166 |         REQUIRE(data_type(butchered) == DataType::CSV_STRING);
167 |     }
168 | }
169 | 
170 | TEST_CASE( "Parse numbers with dash as string", "[regression_double]" ) { // digits joined by dashes are text, not a number
171 |   std::string s("510-123-4567");
172 |   long double out = 0; // passed so the overload taking an output pointer is exercised
173 |   REQUIRE(data_type(s, &out) == DataType::CSV_STRING);
174 | }
175 | 


--------------------------------------------------------------------------------
/tests/test_read_csv_file.cpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  Tests for CSV parsing
  3 |  */
  4 | 
  5 | #include <stdio.h> // remove()
  6 | #include <sstream>
  7 | #include <catch2/catch_all.hpp>
  8 | #include "csv.hpp"
  9 | 
 10 | using namespace csv;
 11 | using std::vector;
 12 | using std::string;
 13 | 
 14 | TEST_CASE("col_pos() Test", "[test_col_pos]") { // get_col_pos() must return the position of a named column
 15 |     int pos = get_col_pos(
 16 |         "./tests/data/real_data/2015_StateDepartment.csv",
 17 |         "Entity Type");
 18 |     REQUIRE(pos == 1); // zero-based: "Entity Type" is the second column listed in the header-row test of this file
 19 | }
 20 | 
 21 | TEST_CASE("Prevent Column Names From Being Overwritten", "[csv_col_names_overwrite]") {
 22 |     std::vector<std::string> column_names = { "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10" };
 23 |     
 24 |     // Test against a variety of different CSVFormat objects, each with several candidate delimiters
 25 |     std::vector<CSVFormat> formats = {};
 26 |     formats.push_back(CSVFormat::guess_csv());
 27 |     formats.push_back(CSVFormat());
 28 |     formats.back().delimiter(std::vector<char>({ ',', '\t', '|'}));
 29 |     formats.push_back(CSVFormat());
 30 |     formats.back().delimiter(std::vector<char>({ ',', '~'}));
 31 | 
 32 |     for (auto& format_in : formats) {
 33 |         // Set up the CSVReader with user-supplied column names
 34 |         format_in.column_names(column_names);
 35 |         CSVReader reader("./tests/data/fake_data/ints_comments.csv", format_in);
 36 | 
 37 |         // Assert that column names weren't overwritten
 38 |         CSVFormat format_out = reader.get_format();
 39 |         REQUIRE(reader.get_col_names() == column_names);
 40 |         REQUIRE(format_out.get_delim() == ','); // a single delimiter must have been settled on
 41 |         REQUIRE(format_out.get_header() == 5); // presumably the header row detected in ints_comments.csv — confirm against the data file
 42 |     }
 43 | }
 44 | 
 45 | // get_file_info()
 46 | TEST_CASE("get_file_info() Test", "[test_file_info]") { // checks delimiter, row count and column metadata reported for two files
 47 |     SECTION("ints.csv") {
 48 |         CSVFileInfo info = get_file_info(
 49 |             "./tests/data/fake_data/ints.csv");
 50 | 
 51 |         REQUIRE(info.delim == ',');
 52 |         REQUIRE(info.n_rows == 100);
 53 |     }
 54 | 
 55 |     SECTION("2009PowerStatus.txt") {
 56 |         CSVFileInfo info = get_file_info(
 57 |             "./tests/data/real_data/2009PowerStatus.txt");
 58 | 
 59 |         REQUIRE(info.delim == '|'); // pipe-delimited file
 60 |         REQUIRE(info.n_rows == 37960); // Can confirm with Excel
 61 |         REQUIRE(info.n_cols == 3);
 62 |         REQUIRE(info.col_names == vector<string>({ "ReportDt", "Unit", "Power" }));
 63 |     }
 64 | }
 65 | 
 66 | TEST_CASE("Non-Existent CSV", "[read_ghost_csv]") {
 67 |     // Make sure attempting to parse a non-existent CSV throws std::runtime_error with the expected message
 68 |     bool error_caught = false;
 69 | 
 70 |     try {
 71 |         CSVReader reader("./lochness.csv");
 72 |     }
 73 |     catch (std::runtime_error& err) {
 74 |         error_caught = true;
 75 |         REQUIRE(err.what() == std::string("Cannot open file ./lochness.csv"));
 76 |     }
 77 | 
 78 |     REQUIRE(error_caught); // fails the test if the constructor did not throw
 79 | }
 80 | 
 81 | TEST_CASE("Test Read CSV where file does NOT end with newline", "[test_file_info_ints2]") { // the final, unterminated row must still be read
 82 |     CSVReader reader("./tests/data/fake_data/ints_doesnt_end_in_newline.csv");
 83 | 
 84 |     auto row = reader.begin();
 85 |     for (; row != reader.end(); row++) {} // skip to end
 86 | 
 87 |     REQUIRE((*row)["A"] == 100); // NOTE(review): row equals end() here; this relies on the iterator still holding the last row read — confirm that is guaranteed
 88 |     REQUIRE((*row)["J"] == 100);
 89 | }
 90 | 
 91 | TEST_CASE( "Test Read CSV with Header Row", "[read_csv_header]" ) {
 92 |     // Header on first row
 93 |     constexpr auto path = "./tests/data/real_data/2015_StateDepartment.csv";
 94 | 
 95 |     // Run the same checks on a path-constructed reader and a stream-constructed reader
 96 |     std::vector<CSVReader> readers = {};
 97 |     std::ifstream infile(path, std::ios::binary);
 98 | 
 99 |     readers.emplace_back(path, CSVFormat()); // Memory mapped
100 |     readers.emplace_back(infile, CSVFormat()); // std::ifstream
101 | 
102 |     for (auto& reader : readers) {
103 |         CSVRow row;
104 |         reader.read_row(row); // Populate row with first line
105 | 
106 |         // Expected Results
107 |         vector<string> col_names = {
108 |             "Year", "Entity Type", "Entity Group", "Entity Name",
109 |             "Department / Subdivision", "Position", "Elected Official",
110 |             "Judicial", "Other Positions", "Min Classification Salary",
111 |             "Max Classification Salary", "Reported Base Wage", "Regular Pay",
112 |             "Overtime Pay", "Lump-Sum Pay", "Other Pay", "Total Wages",
113 |             "Defined Benefit Plan Contribution", "Employees Retirement Cost Covered",
114 |             "Deferred Compensation Plan", "Health Dental Vision",
115 |             "Total Retirement and Health Cost", "Pension Formula",
116 |             "Entity URL", "Entity Population", "Last Updated",
117 |             "Entity County", "Special District Activities"
118 |         };
119 | 
120 |         vector<string> first_row = {
121 |             "2015","State Department","","Administrative Law, Office of","",
122 |             "Assistant Chief Counsel","False","False","","112044","129780",""
123 |             ,"133020.06","0","2551.59","2434.8","138006.45","34128.65","0","0"
124 |             ,"15273.97","49402.62","2.00% @ 55","http://www.spb.ca.gov/","",
125 |             "08/02/2016","",""
126 |         };
127 | 
128 |         REQUIRE(vector<string>(row) == first_row); // CSVRow converted to a vector of its field strings
129 |         REQUIRE(reader.get_col_names() == col_names);
130 | 
131 |         // Drain the remaining rows, then check the total row count
132 |         while (reader.read_row(row));
133 |         REQUIRE(reader.n_rows() == 246497);
134 |     }
135 | }
136 | 
137 | //
138 | // read_row()
139 | //
140 | //! [CSVField Example]
141 | TEST_CASE("Test read_row() CSVField - Easy", "[read_row_csvf1]") {
142 |     // Every field in ints.csv must be recognized as an integer no larger than 100
143 |     CSVReader reader("./tests/data/fake_data/ints.csv");
144 |     CSVRow row;
145 | 
146 |     while (reader.read_row(row)) { // read_row() returns false once no rows remain
147 |         for (size_t i = 0; i < row.size(); i++) {
148 |             REQUIRE(row[i].is_int());
149 |             REQUIRE(row[i].get<int>() <= 100);
150 |         }
151 |     }
152 | }
153 | //! [CSVField Example]
154 | 
155 | TEST_CASE("Test read_row() CSVField - Power Status", "[read_row_csvf3]") { // column lookup by name plus per-row field type checks
156 |     CSVReader reader("./tests/data/real_data/2009PowerStatus.txt");
157 |     CSVRow row;
158 | 
159 |     size_t date = reader.index_of("ReportDt"),
160 |         unit = reader.index_of("Unit"),
161 |         power = reader.index_of("Power");
162 |     
163 |     // A non-existent column must be reported as CSV_NOT_FOUND
164 |     REQUIRE(reader.index_of("metallica") == CSV_NOT_FOUND);
165 | 
166 |     for (size_t i = 0; reader.read_row(row); i++) {
167 |         // Assert correct types
168 |         REQUIRE(row[date].is_str());
169 |         REQUIRE(row[unit].is_str());
170 |         REQUIRE(row[power].is_int());
171 | 
172 |         // Spot check the third row read (i == 2)
173 |         if (i == 2) {
174 |             REQUIRE(row[power].get<int>() == 100);
175 |             REQUIRE(row[date].get<>() == "12/31/2009"); // string_view
176 |             REQUIRE(row[unit].get<std::string>() == "Beaver Valley 1");
177 |         }
178 |     }
179 | }


--------------------------------------------------------------------------------
/tests/test_csv_field.cpp:
--------------------------------------------------------------------------------
  1 | #include "csv.hpp"
  2 | #include <catch2/catch_all.hpp>
  3 | #include <cmath>
  4 | #include <iostream>
  5 | 
  6 | using namespace csv;
  7 | 
  8 | #include "./shared/float_test_cases.hpp"
  9 | 
 10 | TEMPLATE_TEST_CASE("CSVField get<> - String Value", "[test_csv_field_get_string]",
 11 |     signed char, short int, int, long long int, double, long double) { // TestType is each listed numeric type in turn
 12 |     CSVField field("applesauce");
 13 |     REQUIRE(field.get<>() == "applesauce"); // get<>() without a template argument gives back the text
 14 | 
 15 |     // Assert that improper conversion attempts are thwarted
 16 |     bool ex_caught = false;
 17 |     try {
 18 |         field.get<TestType>(); // Expected to throw: the text is not a number
 19 |     }
 20 |     catch (std::runtime_error& err) {
 21 |         REQUIRE(err.what() == csv::internals::ERROR_NAN);
 22 |         ex_caught = true;
 23 |     }
 24 | 
 25 |     REQUIRE(ex_caught); // Fails the test if get<TestType>() returned without throwing
 26 | }
 27 | 
 28 | TEST_CASE("CSVField get<> - Error Messages", "[test_csv_field_get_error]") { // Checks the message carried by the exception for non-numeric text
 29 |     CSVField field("applesauce");
 30 |     
 31 |     bool ex_caught = false;
 32 |     try {
 33 |         field.get<double>(); // Expected to throw: "applesauce" is not a number
 34 |     }
 35 |     catch (std::runtime_error& err) {
 36 |         REQUIRE(err.what() == csv::internals::ERROR_NAN);
 37 |         ex_caught = true;
 38 |     }
 39 | 
 40 |     REQUIRE(ex_caught); // Guards against get<double>() silently succeeding
 41 | }
 42 | 
 43 | TEST_CASE("CSVField get<>() - Integral Value", "[test_csv_field_get_int]") { // An integer field can be read as text and as any wide-enough numeric type
 44 |     CSVField this_year("2019");
 45 |     REQUIRE(this_year.get<>() == "2019");
 46 |     REQUIRE(this_year.get<csv::string_view>() == "2019");
 47 |     REQUIRE(this_year.get<int>() == 2019);
 48 |     REQUIRE(this_year.get<long long int>() == 2019);
 49 |     REQUIRE(this_year.get<float>() == 2019.0f);
 50 |     REQUIRE(this_year.get<double>() == 2019.0);
 51 |     REQUIRE(this_year.get<long double>() == 2019.0l); // 2019.0l is a long double literal; 2019l would be a long
 52 | 
 53 |     bool ex_caught = false;
 54 |     try {
 55 |         this_year.get<signed char>(); // Expected to throw: 2019 does not fit in a signed char
 56 |     }
 57 |     catch (const std::runtime_error& err) {
 58 |         REQUIRE(err.what() == csv::internals::ERROR_OVERFLOW);
 59 |         ex_caught = true;
 60 |     }
 61 | 
 62 |     REQUIRE(ex_caught); // Fails the test if the narrowing conversion was allowed
 63 | }
 64 | 
 65 | TEST_CASE("CSVField get<>() - Integer Boundary Value", "[test_csv_field_get_boundary]") {
 66 |     // Note: The literals are the largest values of 8, 16 and 32 bit integers, so these checks
 67 |     // may fail if the compiler sizes the types differently than Microsoft/GCC/clang
 68 |     REQUIRE(CSVField("127").get<signed char>() == 127); // 2^7 - 1
 69 |     REQUIRE(CSVField("32767").get<short>() == 32767); // 2^15 - 1
 70 |     REQUIRE(CSVField("2147483647").get<int>() == 2147483647); // 2^31 - 1
 71 | 
 72 |     REQUIRE(CSVField("255").get<unsigned char>() == 255); // 2^8 - 1
 73 |     REQUIRE(CSVField("65535").get<unsigned short>() == 65535); // 2^16 - 1
 74 |     REQUIRE(CSVField("4294967295").get<unsigned>() == 4294967295); // 2^32 - 1
 75 | }
 76 | 
 77 | // Test converting a small integer (21 fits in every listed type) to unsigned, signed and plain char integer types
 78 | TEMPLATE_TEST_CASE("CSVField get<>() - Integral Value to Int", "[test_csv_field_convert_int]",
 79 |     unsigned char, unsigned short, unsigned int, unsigned long long,
 80 |     char, short, int, long long int) { // TestType is each listed integer type in turn
 81 |     CSVField savage("21");
 82 |     REQUIRE(savage.get<TestType>() == 21);
 83 | }
 84 | 
 85 | TEST_CASE("CSVField get<>() - Floating Point Value", "[test_csv_field_get_float]") {
 86 |     SECTION("Test get() with various float types") { // One value read back as text, float, double and long double
 87 |         CSVField euler("2.718");
 88 |         REQUIRE(euler.get<>() == "2.718");
 89 |         REQUIRE(euler.get<csv::string_view>() == "2.718");
 90 |         REQUIRE(euler.get<float>() == 2.718f); // Exact comparison with a literal of the same type
 91 |         REQUIRE(euler.get<double>() == 2.718);
 92 |         REQUIRE(euler.get<long double>() == 2.718l);
 93 |     }
 94 | 
 95 |     SECTION("Test get() with various values") {
 96 |         std::string input;
 97 |         long double expected = 0;
 98 | 
 99 |         std::tie(input, expected) = // One (input, expected) pair per run of this section
100 |             GENERATE(table<std::string, long double>(
101 |                 csv_test::FLOAT_TEST_CASES)); // Presumably declared in shared/float_test_cases.hpp (included above)
102 | 
103 |         CSVField testField(input);
104 | 
105 |         REQUIRE(internals::is_equal(testField.get<long double>(), expected)); // Tolerance-based comparison using is_equal's default epsilon
106 |     }
107 | }
108 | 
109 | TEST_CASE("CSVField try_parse_hex()", "[test_csv_field_parse_hex]") {
110 |     int value = 0; // Output argument shared by every try_parse_hex() call below
111 | 
112 |     SECTION("Valid Hex Values") {
113 |         std::unordered_map<std::string, int> test_cases = { // Field text -> expected parsed value
114 |             {"  A   ", 10}, // Surrounding spaces are expected to be accepted
115 |             {"0A", 10},
116 |             {"0B", 11},
117 |             {"0C", 12},
118 |             {"0D", 13},
119 |             {"0E", 14},
120 |             {"0F", 15},
121 |             {"FF", 255},
122 |             {"B00B5", 721077},
123 |             {"D3ADB33F", 3551376191}, // NOTE(review): 3551376191 exceeds 2^31 - 1, so with a 32-bit int the stored expectation is the converted value - confirm this is intended
124 |             {"  D3ADB33F  ", 3551376191}
125 |         };
126 | 
127 |         for (auto& _case : test_cases) {
128 |             REQUIRE(CSVField(_case.first).try_parse_hex(value)); // Parsing must report success ...
129 |             REQUIRE(value == _case.second); // ... and write the expected number into value
130 |         }
131 |     }
132 | 
133 |     SECTION("Invalid Values") {
134 |         std::vector<std::string> invalid_test_cases = {
135 |             "", "    ", "carneasda", "carne asada", "0fg" // Empty, blank, non-hex letters, embedded space, trailing non-hex character
136 |         };
137 | 
138 |         for (auto& _case : invalid_test_cases) {
139 |             REQUIRE(CSVField(_case).try_parse_hex(value) == false);
140 |         }
141 |     }
142 | }
143 | 
144 | 
145 | TEST_CASE("CSVField try_parse_decimal()", "[test_csv_field_parse_decimal]") { // Own tag, so selecting the hex tag no longer runs this test too
146 |     SECTION("Test try_parse_decimal() with non-numeric value") {
147 |         long double output = 0;
148 |         std::string input = "stroustrup";
149 |         CSVField testField(input);
150 | 
151 |         REQUIRE(testField.try_parse_decimal(output, ',') == false); // ',' is passed as the decimal separator
152 |         REQUIRE(testField.type() == DataType::CSV_STRING);
153 |     }
154 | 
155 |     SECTION("Test try_parse_decimal() with integer value") {
156 |         long double output = 0;
157 |         std::string input = "2024";
158 |         CSVField testField(input);
159 | 
160 |         REQUIRE(testField.try_parse_decimal(output, ',') == true);
161 |         REQUIRE(testField.type() == DataType::CSV_INT16); // Text without a separator is still classified as an integer
162 |         REQUIRE(internals::is_equal(output, 2024.0l));
163 |     }
164 | 
165 |     SECTION("Test try_parse_decimal() with various valid values") {
166 |         std::string input;
167 |         long double output = 0;
168 |         long double expected = 0;
169 | 
170 |         std::tie(input, expected) = // One (input, expected) pair per run of this section
171 |             GENERATE(table<std::string, long double>(
172 |                 csv_test::FLOAT_TEST_CASES));
173 | 
174 |         // Rewrite the shared inputs so they use ',' as the decimal separator
175 |         std::replace(input.begin(), input.end(), '.', ',');
176 | 
177 |         CSVField testField(input);
178 | 
179 |         REQUIRE(testField.try_parse_decimal(output, ',') == true);
180 |         REQUIRE(testField.type() == DataType::CSV_DOUBLE);
181 |         REQUIRE(internals::is_equal(output, expected)); // Tolerance-based comparison
182 |     }
183 | }
184 | 
185 | TEMPLATE_TEST_CASE("CSVField get<>() - Disallow Float to Int", "[test_csv_field_get_float_as_int]",
186 |     unsigned char, unsigned short, unsigned int, unsigned long long int,
187 |     signed char, short, int, long long int) { // TestType is each listed integer type in turn
188 |     CSVField euler("2.718");
189 |     bool ex_caught = false;
190 | 
191 |     try {
192 |         euler.get<TestType>(); // Expected to throw: a fractional value must not be truncated to an integer
193 |     }
194 |     catch (std::runtime_error& err) {
195 |         REQUIRE(err.what() == csv::internals::ERROR_FLOAT_TO_INT);
196 |         ex_caught = true;
197 |     }
198 | 
199 |     REQUIRE(ex_caught); // Fails the test if the conversion was allowed
200 | }
201 | 
202 | TEMPLATE_TEST_CASE("CSVField get<>() - Disallow Negative to Unsigned", "[test_csv_field_no_unsigned_neg]",
203 |     unsigned char, unsigned short, unsigned int, unsigned long long int) { // TestType is each listed unsigned type in turn
204 |     CSVField neg("-1337");
205 |     bool ex_caught = false;
206 | 
207 |     try {
208 |         neg.get<TestType>(); // Expected to throw: a negative value has no unsigned representation
209 |     }
210 |     catch (std::runtime_error& err) {
211 |         REQUIRE(err.what() == csv::internals::ERROR_NEG_TO_UNSIGNED);
212 |         ex_caught = true;
213 |     }
214 | 
215 |     REQUIRE(ex_caught); // Fails the test if the conversion was allowed
216 | }
217 | 
218 | TEST_CASE("CSVField Equality Operator", "[test_csv_field_operator==]") { // A field compares equal to its text and to its numeric value
219 |     CSVField field("3.14");
220 |     REQUIRE(field == "3.14"); // Against a string literal
221 |     REQUIRE(field == 3.14f); // Against a float
222 |     REQUIRE(field == 3.14); // Against a double
223 | }


--------------------------------------------------------------------------------
/tests/test_raw_csv_data.cpp:
--------------------------------------------------------------------------------
  1 | #include <catch2/catch_all.hpp>
  2 | #include "internal/basic_csv_parser.hpp"
  3 | #include "internal/csv_row.hpp"
  4 | 
  5 | #include <sstream>
  6 | 
  7 | using namespace csv;
  8 | using namespace csv::internals;
  9 | using RowCollectionTest = ThreadSafeDeque<CSVRow>;
 10 | 
 11 | TEST_CASE("Basic CSV Parse Test", "[raw_csv_parse]") { // Parses four unquoted rows and checks every field
 12 |     std::stringstream csv("A,B,C\r\n"
 13 |         "123,234,345\r\n"
 14 |         "1,2,3\r\n"
 15 |         "1,2,3"); // The last row has no trailing line break
 16 | 
 17 |     RowCollectionTest rows;
 18 | 
 19 |     StreamParser<std::stringstream> parser(
 20 |         csv,
 21 |         internals::make_parse_flags(',', '"'), // ',' as delimiter and '"' as quote character
 22 |         internals::WhitespaceMap() // Value-initialised map: no character is flagged as whitespace
 23 |     );
 24 | 
 25 |     parser.set_output(rows);
 26 |     parser.next(); // The rows checked below are read back from `rows` after this call
 27 | 
 28 |     auto row = rows.front(); // Header row
 29 |     REQUIRE(row[0] == "A");
 30 |     REQUIRE(row[1] == "B");
 31 |     REQUIRE(row[2] == "C");
 32 |     REQUIRE(row.size() == 3);
 33 | 
 34 |     rows.pop_front(); // Advance to the next parsed row
 35 |     row = rows.front();
 36 |     REQUIRE(row[0] == "123");
 37 |     REQUIRE(row[1] == "234");
 38 |     REQUIRE(row[2] == "345");
 39 |     REQUIRE(row.size() == 3);
 40 | 
 41 |     rows.pop_front();
 42 |     row = rows.front();
 43 |     REQUIRE(row[0] == "1");
 44 |     REQUIRE(row[1] == "2");
 45 |     REQUIRE(row[2] == "3");
 46 |     REQUIRE(row.size() == 3);
 47 | 
 48 |     rows.pop_front(); // Final row: the one without a line break
 49 |     row = rows.front();
 50 |     REQUIRE(row[0] == "1");
 51 |     REQUIRE(row[1] == "2");
 52 |     REQUIRE(row[2] == "3");
 53 |     REQUIRE(row.size() == 3);
 54 | }
 55 | 
 56 | TEST_CASE("Test Quote Escapes", "[test_parse_quote_escape]") { // Quoted fields: embedded delimiter, doubled quotes and empty content
 57 |     std::stringstream csv(""
 58 |         "\"A\",\"B\",\"C\"\r\n"   // Quoted fields w/ no escapes
 59 |         "123,\"234,345\",456\r\n" // Escaped comma
 60 |         "1,\"2\"\"3\",4\r\n"      // Escaped quote
 61 |         "1,\"23\"\"34\",5\r\n"      // Another escaped quote
 62 |         "1,\"\",2\r\n");           // Empty Field
 63 | 
 64 |     RowCollectionTest rows;
 65 | 
 66 |     StreamParser<std::stringstream> parser(
 67 |         csv,
 68 |         internals::make_parse_flags(',', '"'), // ',' as delimiter and '"' as quote character
 69 |         internals::WhitespaceMap() // Value-initialised map: no character is flagged as whitespace
 70 |     );
 71 | 
 72 |     parser.set_output(rows);
 73 |     parser.next(); // The rows checked below are read back from `rows` after this call
 74 | 
 75 |     auto row = rows.front(); // Surrounding quotes are not part of the field values
 76 |     REQUIRE(row[0] == "A");
 77 |     REQUIRE(row[1] == "B");
 78 |     REQUIRE(row[2] == "C");
 79 |     REQUIRE(row.size() == 3);
 80 | 
 81 |     rows.pop_front();
 82 |     row = rows.front();
 83 |     REQUIRE(row[0] == "123");
 84 |     REQUIRE(row[1] == "234,345"); // The quoted comma stays inside the field
 85 |     REQUIRE(row[2] == "456");
 86 |     REQUIRE(row.size() == 3);
 87 | 
 88 |     rows.pop_front();
 89 |     row = rows.front();
 90 |     REQUIRE(row[0] == "1");
 91 |     REQUIRE(row[1] == "2\"3"); // A doubled quote collapses to a single quote
 92 |     REQUIRE(row[2] == "4");
 93 |     REQUIRE(row.size() == 3);
 94 | 
 95 |     rows.pop_front();
 96 |     row = rows.front();
 97 |     REQUIRE(row[0] == "1");
 98 |     REQUIRE(row[1] == "23\"34");
 99 |     REQUIRE(row[2] == "5");
100 |     REQUIRE(row.size() == 3);
101 | 
102 |     rows.pop_front();
103 |     row = rows.front();
104 |     REQUIRE(row[0] == "1");
105 |     REQUIRE(row[1] == ""); // A pair of quotes with nothing between them is an empty field
106 |     REQUIRE(row[2] == "2");
107 |     REQUIRE(row.size() == 3);
108 | }
109 | 
110 | /** Build the inputs for the whitespace trimming test: four CSV strings which hold
111 |  *  the same logical rows but pad them with different amounts of spaces and tabs.
112 |  *
113 |  *  Each case is a separate string literal rather than the contents of a reused
114 |  *  stream: std::stringstream::clear() resets only the state flags and leaves the
115 |  *  buffer alone, so a reused stream would carry every earlier case into the next one.
116 |  *
117 |  *  @return Four strings, each made of four rows terminated by '\n'
118 |  */
119 | inline std::vector<std::string> make_whitespace_test_cases() {
120 |     return {
121 |         // A single space around some of the fields
122 |         "1, two,3\n"
123 |         "4, ,5\n"
124 |         " ,6, \n"
125 |         "7,8,9 \n",
126 | 
127 |         // Lots of Whitespace
128 |         "1, two,3\n"
129 |         "4,                    ,5\n"
130 |         "         ,6,       \n"
131 |         "7,8,9 \n",
132 | 
133 |         // Same as above but there's whitespace around 6
134 |         "1, two,3\n"
135 |         "4,                    ,5\n"
136 |         "         , 6 ,       \n"
137 |         "7,8,9 \n",
138 | 
139 |         // Tabs
140 |         "1, two,3\n"
141 |         "4, \t ,5\n"
142 |         "\t\t\t\t\t ,6, \t \n"
143 |         "7,8,9 \n"
144 |     };
145 | }
146 | 
147 | 
148 | TEST_CASE("Test Parser Whitespace Trimming", "[test_csv_trim]") {
149 |     auto row_str = GENERATE(as<std::string> {}, // Every input below must parse to the same header and data row
150 |         "A,B,C\r\n" // Header row
151 |         "123,\"234\n,345\",456\r\n",
152 | 
153 |         // Random spaces
154 |         "A,B,C\r\n"
155 |         "   123,\"234\n,345\",    456\r\n",
156 | 
157 |         // Random spaces + tabs
158 |         "A,B,C\r\n"
159 |         "\t\t   123,\"234\n,345\",    456\r\n",
160 | 
161 |         // Spaces in quote escaped field
162 |         "A,B,C\r\n"
163 |         "\t\t   123,\"   234\n,345  \t\",    456\r\n",
164 | 
165 |         // Spaces in one header column
166 |         "A,B,        C\r\n"
167 |         "123,\"234\n,345\",456\r\n",
168 | 
169 |         // Random spaces + tabs in header
170 |         "\t A,  B\t,     C\r\n"
171 |         "123,\"234\n,345\",456\r\n",
172 | 
173 |         // Random spaces in header + data
174 |         "A,B,        C\r\n"
175 |         "123,\"234\n,345\",  456\r\n"
176 |     );
177 | 
178 |     SECTION("Parse Test") {
179 |         using namespace std;
180 | 
181 |         RowCollectionTest rows;
182 | 
183 |         auto csv = std::stringstream(row_str);
184 |         StreamParser<std::stringstream> parser(
185 |             csv,
186 |             internals::make_parse_flags(',', '"'),
187 |             internals::make_ws_flags({ ' ', '\t' }) // Spaces and tabs are the characters to trim
188 |         );
189 | 
190 |         parser.set_output(rows);
191 |         parser.next(); // The rows checked below are read back from `rows` after this call
192 | 
193 |         auto header = rows[0];
194 |         REQUIRE(vector<string>(header) == vector<string>(
195 |             { "A", "B", "C" })); // Padding around header names is gone
196 | 
197 |         auto row = rows[1];
198 |         REQUIRE(vector<string>(row) ==
199 |             vector<string>({ "123", "234\n,345", "456" })); // The quoted line break and comma are kept
200 |         REQUIRE(row[0] == "123");
201 |         REQUIRE(row[1] == "234\n,345");
202 |         REQUIRE(row[2] == "456");
203 |     }
204 | }
205 | 
206 | TEST_CASE("Test Parser Whitespace Trimming w/ Empty Fields", "[test_raw_ws_trim]") {
207 |     auto csv_string = GENERATE(from_range(make_whitespace_test_cases())); // One run per generated input
208 | 
209 |     SECTION("Parse Test") {
210 |         RowCollectionTest rows;
211 | 
212 |         auto csv = std::stringstream(csv_string);
213 |         StreamParser<std::stringstream> parser(
214 |             csv,
215 |             internals::make_parse_flags(',', '"'),
216 |             internals::make_ws_flags({ ' ', '\t' }) // Spaces and tabs are the characters to trim
217 |         );
218 | 
219 |         parser.set_output(rows);
220 | 
221 |         parser.next();
222 | 
223 |         size_t row_no = 0; // Index of the row being checked; rows past index 3 are not inspected
224 |         for (auto& row : rows) {
225 |             switch (row_no) {
226 |             case 0:
227 |                 REQUIRE(row[0].get<uint32_t>() == 1);
228 |                 REQUIRE(row[1].get<std::string>() == "two"); // The leading space is trimmed
229 |                 REQUIRE(row[2].get<uint32_t>() == 3);
230 |                 break;
231 | 
232 |             case 1:
233 |                 REQUIRE(row[0].get<uint32_t>() == 4);
234 |                 REQUIRE(row[1].is_null()); // A whitespace-only field counts as null
235 |                 REQUIRE(row[2].get<uint32_t>() == 5);
236 |                 break;
237 | 
238 |             case 2:
239 |                 REQUIRE(row[0].is_null());
240 |                 REQUIRE(row[1].get<uint32_t>() == 6);
241 |                 REQUIRE(row[2].is_null());
242 |                 break;
243 | 
244 |             case 3:
245 |                 REQUIRE(row[0].get<uint32_t>() == 7);
246 |                 REQUIRE(row[1].get<uint32_t>() == 8);
247 |                 REQUIRE(row[2].get<uint32_t>() == 9); // The trailing space is trimmed
248 |                 break;
249 |             }
250 | 
251 |             row_no++;
252 |         }
253 |     }
254 | }
255 | 


--------------------------------------------------------------------------------
/include/internal/common.hpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  A standalone header file containing shared code
  3 |  */
  4 | 
  5 | #pragma once
  6 | #include <algorithm>
  7 | #include <array>
  8 | #include <cmath>
  9 | #include <cstdlib>
 10 | #include <deque>
 11 | 
 12 | #if defined(_WIN32)
 13 | # ifndef WIN32_LEAN_AND_MEAN
 14 | #  define WIN32_LEAN_AND_MEAN
 15 | # endif
 16 | # include <windows.h>
 17 | # undef max
 18 | # undef min
 19 | #elif defined(__linux__)
 20 | # include <unistd.h>
 21 | #endif
 22 | 
 23 |  /** Helper macro which should be #defined as "inline"
 24 |   *  in the single header version
 25 |   */
 26 | #define CSV_INLINE
 27 | 
 28 | #include <limits>      // std::numeric_limits, needed by CHAR_OFFSET (replaces a repeated #pragma once)
 29 | #include <type_traits>
 30 | 
 31 | #include "../external/string_view.hpp"
 32 | 
 33 |   // If there is another version of Hedley, then the newer one 
 34 |   // takes precedence.
 35 |   // See: https://github.com/nemequ/hedley
 36 | #include "../external/hedley.h"
 37 | 
 38 | namespace csv {
 39 | #ifdef _MSC_VER
 40 | #pragma region Compatibility Macros
 41 | #endif
 42 |     /**
 43 |      *  @def IF_CONSTEXPR
 44 |      *  Expands to `if constexpr` in C++17 and `if` otherwise
 45 |      *
 46 |      *  @def CONSTEXPR_VALUE
 47 |      *  Expands to `constexpr` in C++17 and `const` otherwise.
 48 |      *  Mainly used for global variables.
 49 |      *
 50 |      *  @def CONSTEXPR
 51 |      *  Expands to `constexpr` in decent compilers and `inline` otherwise.
 52 |      *  Intended for functions and methods.
 53 |      */
 54 | 
 55 | #define STATIC_ASSERT(x) static_assert(x, "Assertion failed")
 56 | 
 57 | #if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD >= 20) || __cplusplus >= 202002L
 58 | #define CSV_HAS_CXX20
 59 | #endif
 60 | // A newer standard implies the older ones, so every CMAKE_CXX_STANDARD test uses >=
 61 | #if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD >= 17) || __cplusplus >= 201703L
 62 | #define CSV_HAS_CXX17
 63 | #endif
 64 | // CMAKE_CXX_STANDARD covers compilers whose __cplusplus value does not reflect the selected standard
 65 | #if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD >= 14) || __cplusplus >= 201402L
 66 | #define CSV_HAS_CXX14
 67 | #endif
 68 | 
 69 | #ifdef CSV_HAS_CXX17
 70 | #include <string_view>
 71 |      /** @typedef string_view
 72 |       *  The string_view class used by this library.
 73 |       */
 74 |     using string_view = std::string_view;
 75 | #else
 76 |      /** @typedef string_view
 77 |       *  The string_view class used by this library.
 78 |       */
 79 |     using string_view = nonstd::string_view;
 80 | #endif
 81 | 
 82 | #ifdef CSV_HAS_CXX17
 83 |     #define IF_CONSTEXPR if constexpr
 84 |     #define CONSTEXPR_VALUE constexpr
 85 | 
 86 |     #define CONSTEXPR_17 constexpr
 87 | #else
 88 |     #define IF_CONSTEXPR if
 89 |     #define CONSTEXPR_VALUE const
 90 | 
 91 |     #define CONSTEXPR_17 inline
 92 | #endif
 93 | 
 94 | #ifdef CSV_HAS_CXX14
 95 |     template<bool B, class T = void>
 96 |     using enable_if_t = std::enable_if_t<B, T>;
 97 | 
 98 |     #define CONSTEXPR_14 constexpr
 99 |     #define CONSTEXPR_VALUE_14 constexpr
100 | #else
101 |     template<bool B, class T = void>
102 |     using enable_if_t = typename std::enable_if<B, T>::type;
103 | 
104 |     #define CONSTEXPR_14 inline
105 |     #define CONSTEXPR_VALUE_14 const
106 | #endif
107 | 
108 |     // Works around a g++ bug affecting constexpr methods: CONSTEXPR is constexpr on GCC 7.2+/8+, and on other compilers under C++17
109 |     // See: https://stackoverflow.com/questions/36489369/constexpr-non-static-member-function-with-non-constexpr-constructor-gcc-clang-d
110 | #if defined(__GNUC__) && !defined(__clang__)
111 |     #if (__GNUC__ >= 7 && __GNUC_MINOR__ >= 2) || (__GNUC__ >= 8)
112 |         #define CONSTEXPR constexpr
113 |     #endif
114 | #else
115 |     #ifdef CSV_HAS_CXX17
116 |         #define CONSTEXPR constexpr
117 |     #endif
118 | #endif
119 | 
120 | #ifndef CONSTEXPR
121 |     #define CONSTEXPR inline
122 | #endif
123 | 
124 | #ifdef _MSC_VER
125 | #pragma endregion
126 | #endif
127 | 
128 |     namespace internals {
129 |         // PAGE_SIZE macro could be already defined by the host system.
130 | #if defined(PAGE_SIZE)
131 | #undef PAGE_SIZE
132 | #endif
133 | 
134 | // Get operating system specific details
135 | #if defined(_WIN32)
136 |         inline int getpagesize() { // Windows counterpart of the getpagesize() call used in the Linux branch below
137 |             _SYSTEM_INFO sys_info = {}; // Zero-initialised before the system call fills it in
138 |             GetSystemInfo(&sys_info);
139 |             return std::max(sys_info.dwPageSize, sys_info.dwAllocationGranularity); // The larger of page size and allocation granularity
140 |         }
141 | 
142 |         const int PAGE_SIZE = getpagesize();
143 | #elif defined(__linux__) 
144 |         const int PAGE_SIZE = getpagesize();
145 | #else
146 |         /** Size of a memory page in bytes. Used by
147 |          *  csv::internals::CSVFieldArray when allocating blocks.
148 |          */
149 |         const int PAGE_SIZE = 4096;
150 | #endif
151 | 
152 |         /** For functions that lazy load a large CSV, this determines how
153 |          *  many bytes are read at a time
154 |          */
155 |         constexpr size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB
156 | 
157 |         /** Returns true if two floating point values differ by less than epsilon */
158 |         template<typename T>
159 |         inline bool is_equal(T a, T b, T epsilon = 0.001) {
160 |             static_assert(std::is_floating_point<T>::value, "T must be a floating point type.");
161 |             return epsilon > std::abs(a - b);
162 |         }
163 | 
164 |         /** @typedef ParseFlags
165 |          *  Classifies a character by the role it plays while parsing CSV. The values are
166 |          *  bit patterns: clearing the QUOTE bits yields the meaning inside a quoted field.
167 |          *  @see quote_escape_flag
168 |          */
169 |         enum class ParseFlags {
170 |             QUOTE_ESCAPE_QUOTE = 0, /**< A quote inside or terminating a quote_escaped field */
171 |             QUOTE = 2 | 1,          /**< Characters which may signify a quote escape */
172 |             NOT_SPECIAL = 4,        /**< Characters with no special meaning or escaped delimiters and newlines */
173 |             DELIMITER = 4 | 2,      /**< Characters which signify a new field */
174 |             NEWLINE = 4 | 2 | 1     /**< Characters which signify a new row */
175 |         };
176 | 
177 |         /** Reinterpret a flag for the current quoting state: when quote_escape is true the
178 |          *  QUOTE bits are cleared from the flag, otherwise the flag is returned unchanged */
179 |         constexpr ParseFlags quote_escape_flag(ParseFlags flag, bool quote_escape) noexcept {
180 |             return static_cast<ParseFlags>(
181 |                 static_cast<int>(flag) & ~(static_cast<int>(ParseFlags::QUOTE) * quote_escape));
182 |         }
183 | 
184 |         // Assumed to be true by parsing functions: allows for testing
185 |         // if an item is DELIMITER or NEWLINE with a >= statement
186 |         STATIC_ASSERT(ParseFlags::DELIMITER < ParseFlags::NEWLINE);
187 | 
188 |         /** Optimizations for reducing branching in parsing loop
189 |          *
190 |          *  Idea: The meaning of all non-quote characters changes depending
191 |          *  on whether or not the parser is in a quote-escaped mode (0 or 1)
192 |          */
193 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, false) == ParseFlags::NOT_SPECIAL);
194 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, false) == ParseFlags::QUOTE);
195 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, false) == ParseFlags::DELIMITER);
196 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, false) == ParseFlags::NEWLINE);
197 | 
198 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, true) == ParseFlags::NOT_SPECIAL);
199 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, true) == ParseFlags::QUOTE_ESCAPE_QUOTE);
200 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, true) == ParseFlags::NOT_SPECIAL);
201 |         STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, true) == ParseFlags::NOT_SPECIAL);
202 | 
203 |         /** An array which maps ASCII chars to a parsing flag */
204 |         using ParseFlagMap = std::array<ParseFlags, 256>;
205 | 
206 |         /** An array which maps ASCII chars to a flag indicating if it is whitespace */
207 |         using WhitespaceMap = std::array<bool, 256>;
208 |     }
209 | 
210 |     /** Integer indicating a requested column wasn't found. */
211 |     constexpr int CSV_NOT_FOUND = -1;
212 | 
213 |     /** Offset to convert char into array index. */
214 |     constexpr unsigned CHAR_OFFSET = std::numeric_limits<char>::is_signed ? 128 : 0;
215 | }
216 | 


--------------------------------------------------------------------------------
/single_header.py:
--------------------------------------------------------------------------------
  1 | from collections import namedtuple
  2 | import os
  3 | import re
  4 | 
  5 | CPP_SEP = '/'
  6 | Include = namedtuple('Include', ['path', 'line_no'])
  7 | 
  8 | ''' Represents a file path '''
  9 | class Path(list):
 10 |     def __init__(self, *args):
 11 |         super().__init__()
 12 | 
 13 |         if (len(args) > 0 and type(args[0]) is list):
 14 |             for p in args[0]:
 15 |                 self.append(p)
 16 |         else:
 17 |             for p in args:
 18 |                 self.append(p)
 19 | 
 20 |     def append(self, sub: str):
 21 |         separated = sub.split(os.sep)
 22 |         if (len(separated) == 1):
 23 |             separated = sub.split(CPP_SEP)
 24 | 
 25 |         for i in separated:
 26 |             if (i == '..'):
 27 |                 # Go up a path
 28 |                 self.pop()
 29 |             else:
 30 |                 super().append(i)
 31 | 
 32 |     def copy(self):
 33 |         temp = Path()
 34 |         for i in self:
 35 |             temp.append(i)
 36 |         return temp
 37 | 
 38 |     def join(self, sub: str):
 39 |         temp = self.copy()
 40 |         temp.append(sub)
 41 |         return temp
 42 | 
 43 |     ''' Return the first element of the path '''
 44 |     def dirname(self) -> str:
 45 |         try:
 46 |             return self[0]
 47 |         except IndexError:
 48 |             return ''
 49 | 
 50 |     def ext(self) -> str:
 51 |         try:
 52 |             return self[-1].split('.')[-1]
 53 |         except IndexError:
 54 |             return ''
 55 | 
 56 |     def __str__(self):
 57 |         if (len(self) == 1):
 58 |             return self[0] + '/'
 59 | 
 60 |         return '/'.join(self)
 61 | 
 62 |     def __hash__(self):
 63 |         return hash(str(self))
 64 | 
 65 | def header_list(files: list) -> list:
 66 |     '''
 67 |     Given a list of files, compute the list of header files in the order in which they should
 68 |     be included to avoid conflicts
 69 |     '''
 70 | 
 71 |     dependencies = {}
 72 |     headers = []
 73 | 
 74 |     ''' Iterate over every .cpp and .hpp file '''
 75 |     for file in files:
 76 |         file_ext = file.ext()
 77 |         if (file_ext == 'hpp' or file_ext == 'h'):
 78 |             dependencies[file] = [d.path for d in get_dependencies(file)['local']]
 79 | 
 80 |     while dependencies:
 81 |         for file in list(dependencies.keys()):
 82 |             # Remove includes we've already included
 83 |             dependencies[file] = [i for i in dependencies[file] if i not in headers]
 84 | 
 85 |             # If no more dependencies, add file
 86 |             if not dependencies[file]:
 87 |                 headers.append(file)
 88 |                 dependencies.pop(file)
 89 | 
 90 |     return headers
 91 | 
 92 | def get_dependencies(file: Path) -> dict:
 93 |     ''' Parse a .cpp/.hpp file for its system and local dependencies '''
 94 | 
 95 |     dir = Path(file[:-1])
 96 | 
 97 |     headers = {
 98 |         "system": [],
 99 |         "local": []
100 |     }
101 | 
102 |     with open(str(file), mode='r') as infile:
103 |         for i, line in enumerate(infile):
104 |             sys_include = re.search('^#include <(?P<file>.*)>', line)
105 |             local_include = re.search('^#include "(?P<file>.*)"', line)
106 |             if sys_include:
107 |                 headers["system"].append(
108 |                     Include(path=sys_include.group('file'), line_no=i))
109 |             elif local_include:
110 |                 headers["local"].append(
111 |                     Include(path=dir.join(local_include.group('file')), line_no=i))
112 | 
113 |     return headers
114 | 
115 | ''' Strip local include statements and #pragma once declarations from source files '''
116 | def file_strip(file: Path) -> str:
117 |     new_file = ''
118 |     strip_these = ['#include "(?P<file>.*)"', '#pragma once' ]
119 | 
120 |     # Strip out pragma once
121 |     with open(str(file), mode='r') as infile:
122 |         for line in infile:
123 |             add_this_line = sum(re.search(strip, line) is not None for strip in strip_these) == 0
124 | 
125 |             # Change "#define CSV_INLINE" to "#define CSV_INLINE inline"
126 |             if ('#define CSV_INLINE' in line):
127 |                 line = "#define CSV_INLINE inline\n"
128 | 
129 |             if (add_this_line):
130 |                 new_file += line
131 | 
132 |     return new_file
133 | 
134 | '''
135 | Collate header files using the following algorithm:
136 | 
137 | - Given a list of header files (HEADERS) ordered such that the first file
138 |     has no internal dependencies, and the last file is the most dependent
139 |     - Reverse the list
140 | - Maintain these data structures:
141 |     - A set of header files (PROCESSED) that were processed
142 |     - A set of header files (MISSING_INCLUDES) that we are looking for
143 |     - The collation of header source code (HEADER_CONCAT)
144 | - Go through each FILE in list of headers in reverse order (starting with
145 |   the headers at the highest level of the dependency tree)
146 |     - If FILE is not in MISSING_INCLUDES, then concatenate source verbatim to HEADER_CONCAT
147 |     - Otherwise, there is one or more #include statements in HEADER_CONCAT which references FILE 
148 |         - Replace the first #include statement with the source of FILE, and remove the rest
149 | '''
def header_collate(headers: list):
    ''' Collate the given header files into one block of source text.

    headers is expected to be ordered from least to most dependent; the files
    are visited in the opposite order. The caller's list is NOT modified.

    Local `#include "..."` lines are swapped for placeholders which are later
    replaced by the source of the referenced file (first occurrence) or
    dropped (subsequent occurrences). `#pragma once` lines are removed.

    Returns the collated source as a single string.
    '''
    # Placeholder for includes to be inserted
    splice_template = "__INSERT_HEADER_HERE__({})\n"
    # Compiled once instead of once per line read
    local_include_re = re.compile('^#include "(?P<file>.*)"')
    header_concat = ''
    processed = set()
    missing_includes = set()

    def process_file(path: Path):
        ''' Return the source of path with local includes replaced by placeholders '''
        source = ''

        with open(str(path), mode='r') as infile:
            for line in infile:
                # Add local includes to MISSING_INCLUDES
                local_include = local_include_re.search(line)
                if local_include:
                    # Resolve the include relative to the including file's directory
                    parent_dir = Path(path[:-1])
                    include_path = parent_dir.join(local_include.group('file'))

                    # Includes of already-processed files are dropped entirely
                    if str(include_path) not in processed:
                        missing_includes.add(str(include_path))
                        source += splice_template.format(str(include_path))
                elif '#pragma once' in line:
                    continue
                else:
                    source += line

        return source

    # Iterate over a reversed view so the caller's list keeps its order
    for path in reversed(headers):
        processed.add(str(path))

        if str(path) in missing_includes:
            source = process_file(path)
            splice_phrase = splice_template.format(str(path))

            # First placeholder receives the file's source ...
            header_concat = header_concat.replace(
                splice_phrase,
                source + '\n', 1)
            # ... and any remaining placeholders for the same file are removed
            header_concat = header_concat.replace(splice_phrase, '')

            missing_includes.remove(str(path))
        else:
            header_concat += process_file(path)

    return header_concat
196 | 
if __name__ == "__main__":
    # Collect every .hpp/.h and .cpp file found under include/
    headers = []
    sources = []
    system_includes = set()

    for root, _subdirs, filenames in os.walk('include'):
        for filename in filenames:
            location = Path(root, filename)

            if filename.endswith('.hpp') or filename.endswith('.h'):
                headers.append(location)
            elif filename.endswith('.cpp'):
                sources.append(location)

    # Rearrange header order to avoid compilation conflicts
    headers = header_list(sorted(headers))

    # Gather the system includes referenced by any source or header
    for location in sources + headers:
        system_includes.update(
            dep.path for dep in get_dependencies(location)['system'])

    # Collate header and source files
    header_concat = header_collate(headers)
    source_collate = ''.join(file_strip(cpp) + '\n' for cpp in sources)

    # Emit the single header: make CSV_INLINE expand to `inline` and splice the
    # stripped sources in at the marker comment
    single_header = header_concat.replace(
        "#define CSV_INLINE", "#define CSV_INLINE inline")
    single_header = single_header.replace(
        "/** INSERT_CSV_SOURCES **/", source_collate)

    print("#pragma once")
    print(single_header)


--------------------------------------------------------------------------------
/python/csvpy.cpp:
--------------------------------------------------------------------------------
  1 | #include <pybind11/pybind11.h>
  2 | #include <pybind11/stl.h>
  3 | #include <pybind11/operators.h>
  4 | #include <utility>
  5 | #include <vector>
  6 | #include <algorithm>
  7 | #include "csv.hpp"
  8 | namespace py = pybind11;
  9 | using namespace pybind11::literals;
 10 | using namespace csv;
 11 | 
 12 | void init_CSVFormat(py::module& m){
 13 |     py::class_<CSVFormat>(m, "Format")
 14 |     .def(py::init<>())
 15 |     .def("delimiter",
 16 |              py::overload_cast<const std::vector<char>&>(&CSVFormat::delimiter),
 17 |              "Sets a list of potential delimiters.",
 18 |              py::arg("delim"))
 19 |     .def("delimiter",
 20 |              py::overload_cast<char>(&CSVFormat::delimiter),
 21 |              "Sets the delimiter of the CSV file.",
 22 |              py::arg("delim"))
 23 | 
 24 |     .def("trim", 
 25 |         &CSVFormat::trim, 
 26 |         "Sets the whitespace characters to be trimmed",
 27 |         py::arg("ws"))
 28 | 
 29 |     .def("quote", 
 30 |         py::overload_cast<char>(&CSVFormat::quote),
 31 |         "Sets the quote character",
 32 |         py::arg("quote"))
 33 | 
 34 |     .def("quote", 
 35 |         py::overload_cast<bool>(&CSVFormat::quote),
 36 |         "Turn quoting on or off",
 37 |         py::arg("use_quote"))
 38 | 
 39 |     .def("column_names", 
 40 |         &CSVFormat::column_names,
 41 |         "Sets the column names.",
 42 |         py::arg("names"))
 43 | 
 44 |     .def("header_row", 
 45 |         &CSVFormat::header_row,
 46 |         "Sets the header row",
 47 |         py::arg("row"))
 48 |     .def("no_header", 
 49 |         &CSVFormat::no_header,
 50 |         "Tells the parser that this CSV has no header row")
 51 |     .def("is_quoting_enabled",
 52 |     &CSVFormat::is_quoting_enabled)
 53 |     .def("get_quote_char",
 54 |     &CSVFormat::get_quote_char)
 55 |     .def("get_header", &CSVFormat::get_header)
 56 |     .def("get_possible_delims",
 57 |     &CSVFormat::get_possible_delims)
 58 |     .def("get_trim_chars",
 59 |     &CSVFormat::get_trim_chars);
 60 | }
 61 | 
 62 | void init_CSVReader(py::module& m){
 63 |     py::class_<CSVReader>(m, "Reader")
 64 |     .def(py::init<csv::string_view, CSVFormat>(), 
 65 |     "filename"_a, 
 66 |     "format"_a=CSVFormat::guess_csv())
 67 |     .def("eof", 
 68 |     &CSVReader::eof,
 69 |     "Returns true if we have reached end of file")
 70 |     .def("get_format", 
 71 |     &CSVReader::get_format)
 72 |     .def("empty", 
 73 |     &CSVReader::empty)
 74 |     .def("n_rows", 
 75 |     &CSVReader::n_rows,
 76 |     "Retrieves the number of rows that have been read so far")
 77 |     .def("utf8_bom", 
 78 |     &CSVReader::utf8_bom,
 79 |     "Whether or not CSV was prefixed with a UTF-8 bom")
 80 |     .def("__iter__", 
 81 |     [](CSVReader& reader){return py::make_iterator(reader.begin(), reader.end());},
 82 |     py::keep_alive<0, 1>());
 83 | }
 84 | 
 85 | void init_CSVRow(py::module& m){
 86 |     py::class_<CSVRow>(m, "Row")
 87 |     .def(py::init<>())
 88 |     .def("empty", 
 89 |     &CSVRow::empty, 
 90 |     "Indicates whether row is empty or not")
 91 |     .def("size", 
 92 |     &CSVRow::size,
 93 |     "Return the number of fields in this row")
 94 |     
 95 |     .def("get_col_names", 
 96 |     &CSVRow::get_col_names,
 97 |     "Retrieve this row's associated column names")
 98 | 
 99 |     .def("to_json", &CSVRow::to_json, "subset"_a=std::vector<std::string>{})
100 | 
101 |     .def("to_json_array", &CSVRow::to_json_array, "subset"_a=std::vector<std::string>{})
102 | 
103 |     .def("__getitem__", [](const CSVRow& row, size_t idx){
104 |         if(idx >= row.size()){
105 |             throw py::index_error("index out of range");
106 |         }
107 |         return row[idx];
108 |     }, py::is_operator())
109 | 
110 |     .def("__getitem__", [](const CSVRow& row, std::string col_name){
111 |         auto column_names = row.get_col_names();
112 |         auto it = std::find(column_names.begin(), column_names.end(), col_name);
113 |         if (it != column_names.end()){
114 |             return row[it - column_names.begin()];
115 |         }else{
116 |             throw py::index_error("Can't find a column named " + col_name);
117 |         }
118 |     }, py::is_operator());
119 | }
120 | 
121 | void init_DataType(py::module& m){
122 |     py::enum_<DataType>(m,
123 |     "DataType", 
124 |     py::arithmetic(),
125 |     "Enumerates the different CSV field types that are recognized by this library")
126 |     .value("UNKNOWN" ,DataType::UNKNOWN)
127 |     .value("CSV_NULL", DataType::CSV_NULL, "Empty string")
128 |     .value("CSV_STRING", DataType::CSV_STRING, "Non-numeric string")
129 |     .value("CSV_INT8", DataType::CSV_INT8, "8-bit integer")
130 |     .value("CSV_INT16", DataType::CSV_INT16, "16-bit integer")
131 |     .value("CSV_INT32", DataType::CSV_INT32, "32-bit integer")
132 |     .value("CSV_INT64", DataType::CSV_INT64, "64-bit integer")
133 |     .value("CSV_DOUBLE", DataType::CSV_DOUBLE, "Floating point value");
134 | }
135 | 
136 | void init_CSVField(py::module& m){
137 |     py::class_<CSVField>(m, "Field")
138 |     .def(py::init<csv::string_view>())
139 |     .def("is_null", 
140 |     &CSVField::is_null, 
141 |     "Returns true if field is an empty string or string of whitespace characters")
142 |     .def("get_sv", 
143 |     &CSVField::get_sv,
144 |     "Return a string view over the field's contents")
145 |     .def("is_str", 
146 |     &CSVField::is_str,
147 |     "Returns true if field is a non-numeric, non-empty string")
148 |     .def("is_num", 
149 |     &CSVField::is_num,
150 |     "Returns true if field is an integer or float")
151 |     .def("is_int", 
152 |     &CSVField::is_int,
153 |     "Returns true if field is an integer")
154 |     .def("is_float", 
155 |     &CSVField::is_float,
156 |     "Returns true if field is a floating point value")
157 |     .def("type",
158 |     &CSVField::type,
159 |     "Return the type of the underlying CSV data")
160 |     .def("get_int", &CSVField::get<int64_t>)
161 |     .def("get_str", &CSVField::get<std::string>)
162 |     .def("get_double", &CSVField::get<double>)
163 |     .def("get_float", &CSVField::get<float>);
164 | }
165 | 
166 | void init_CSVUtility(py::module& m){
167 |     py::class_<CSVFileInfo>(m, "CSVFileInfo")
168 |     .def_readonly("filename",&CSVFileInfo::filename)
169 |     .def_readonly("col_names", &CSVFileInfo::col_names)
170 |     .def_readonly("delim", &CSVFileInfo::delim)
171 |     .def_readonly("n_rows", &CSVFileInfo::n_rows)
172 |     .def_readonly("n_cols", &CSVFileInfo::n_cols);
173 |     
174 |     m.def("parse", 
175 |     &parse,
176 |     "Shorthand function for parsing an in-memory CSV string",
177 |     py::arg("in"), py::arg("format"))
178 |     .def("parse_no_header",
179 |     &parse_no_header,
180 |     "Parses a CSV string with no headers",
181 |     py::arg("in"))
182 |     .def("get_col_pos",
183 |     &get_col_pos,
184 |     "Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise",
185 |     py::arg("filename"),
186 |     py::arg("col_name"),
187 |     py::arg("format"))
188 |     .def("get_file_info",
189 |     &get_file_info,
190 |     "Get basic information about a CSV file",
191 |     py::arg("filename"))
192 |     .def("csv_data_types",
193 |     &csv_data_types,
194 |     "Return a data type for each column such that every value in a column can be converted to the corresponding data type without data loss.",
195 |     py::arg("filename"));
196 | }
197 | 
198 | void init_CSVStat(py::module& m){
199 |     py::class_<CSVStat>(m, "CSVStat")
200 |     .def(py::init<csv::string_view, CSVFormat>(),
201 |     "filename"_a,
202 |     "format"_a=CSVFormat::guess_csv())
203 |     .def("get_mean",
204 |     &CSVStat::get_mean,
205 |     "Return current means")
206 |     .def("get_variance",
207 |     &CSVStat::get_variance,
208 |     "Return current variances")
209 |     .def("get_mins",
210 |     &CSVStat::get_mins,
211 |     "Return current mins")
212 |     .def("get_maxes",
213 |     &CSVStat::get_maxes,
214 |     "Return current maxes")
215 |     .def("get_counts",
216 |     &CSVStat::get_counts,
217 |     "Get counts for each column")
218 |     .def("get_dtypes",
219 |     &CSVStat::get_dtypes,
220 |     "Get data type counts for each column")
221 |     .def("get_col_names",
222 |     &CSVStat::get_col_names,
223 |     "Return the CSV's column names as a List of strings.");
224 | }
225 | 
226 | PYBIND11_MODULE(csvpy, m){
227 |     m.doc() = "A modern C++ library for reading, writing, and analyzing CSV (and similar) files.";
228 |     init_CSVFormat(m);
229 |     init_CSVReader(m);
230 |     init_CSVRow(m);
231 |     init_DataType(m);
232 |     init_CSVField(m);
233 |     init_CSVUtility(m);
234 |     init_CSVStat(m);
235 | }


--------------------------------------------------------------------------------
/include/internal/csv_row_json.cpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  Implements JSON serialization abilities
  3 |  */
  4 | 
  5 | #include "csv_row.hpp"
  6 | 
  7 | namespace csv {
  8 |     /*
  9 |     The implementations for json_extra_space() and json_escape_string()
 10 |     were modified from source code for JSON for Modern C++.
 11 | 
 12 |     The respective license is below:
 13 | 
 14 |     The code is licensed under the [MIT
 15 |     License](http://opensource.org/licenses/MIT):
 16 |     
 17 |     Copyright &copy; 2013-2015 Niels Lohmann.
 18 |     
 19 |     Permission is hereby granted, free of charge, to any person
 20 |     obtaining a copy of this software and associated documentation files
 21 |     (the "Software"), to deal in the Software without restriction,
 22 |     including without limitation the rights to use, copy, modify, merge,
 23 |     publish, distribute, sublicense, and/or sell copies of the Software,
 24 |     and to permit persons to whom the Software is furnished to do so,
 25 |     subject to the following conditions:
 26 |     
 27 |     The above copyright notice and this permission notice shall be
 28 |     included in all copies or substantial portions of the Software.
 29 |     
 30 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 31 |     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 32 |     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 33 |     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 34 |     BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 35 |     ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 36 |     CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 37 |     SOFTWARE.
 38 |     */
 39 | 
 40 |     namespace internals {
 41 |         /*!
 42 |          @brief calculates the extra space to escape a JSON string
 43 | 
 44 |          @param[in] s  the string to escape
 45 |          @return the number of characters required to escape string @a s
 46 | 
 47 |          @complexity Linear in the length of string @a s.
 48 |         */
 49 |         static std::size_t json_extra_space(csv::string_view& s) noexcept
 50 |         {
 51 |             std::size_t result = 0;
 52 | 
 53 | 
 54 |             for (const auto& c : s)
 55 |             {
 56 |                 switch (c)
 57 |                 {
 58 |                 case '"':
 59 |                 case '\\':
 60 |                 case '\b':
 61 |                 case '\f':
 62 |                 case '\n':
 63 |                 case '\r':
 64 |                 case '\t':
 65 |                 {
 66 |                     // from c (1 byte) to \x (2 bytes)
 67 |                     result += 1;
 68 |                     break;
 69 |                 }
 70 | 
 71 | 
 72 |                 default:
 73 |                 {
 74 |                     if (c >= 0x00 && c <= 0x1f)
 75 |                     {
 76 |                         // from c (1 byte) to \uxxxx (6 bytes)
 77 |                         result += 5;
 78 |                     }
 79 |                     break;
 80 |                 }
 81 |                 }
 82 |             }
 83 | 
 84 | 
 85 |             return result;
 86 |         }
 87 | 
 88 |         CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept
 89 |         {
 90 |             const auto space = json_extra_space(s);
 91 |             if (space == 0)
 92 |             {
 93 |                 return std::string(s);
 94 |             }
 95 | 
 96 |             // create a result string of necessary size
 97 |             size_t result_size = s.size() + space;
 98 |             std::string result(result_size, '\\');
 99 |             std::size_t pos = 0;
100 | 
101 |             for (const auto& c : s)
102 |             {
103 |                 switch (c)
104 |                 {
105 |                 // quotation mark (0x22)
106 |                 case '"':
107 |                 {
108 |                     result[pos + 1] = '"';
109 |                     pos += 2;
110 |                     break;
111 |                 }
112 | 
113 | 
114 |                 // reverse solidus (0x5c)
115 |                 case '\\':
116 |                 {
117 |                     // nothing to change
118 |                     pos += 2;
119 |                     break;
120 |                 }
121 | 
122 | 
123 |                 // backspace (0x08)
124 |                 case '\b':
125 |                 {
126 |                     result[pos + 1] = 'b';
127 |                     pos += 2;
128 |                     break;
129 |                 }
130 | 
131 | 
132 |                 // formfeed (0x0c)
133 |                 case '\f':
134 |                 {
135 |                     result[pos + 1] = 'f';
136 |                     pos += 2;
137 |                     break;
138 |                 }
139 | 
140 | 
141 |                 // newline (0x0a)
142 |                 case '\n':
143 |                 {
144 |                     result[pos + 1] = 'n';
145 |                     pos += 2;
146 |                     break;
147 |                 }
148 | 
149 | 
150 |                 // carriage return (0x0d)
151 |                 case '\r':
152 |                 {
153 |                     result[pos + 1] = 'r';
154 |                     pos += 2;
155 |                     break;
156 |                 }
157 | 
158 | 
159 |                 // horizontal tab (0x09)
160 |                 case '\t':
161 |                 {
162 |                     result[pos + 1] = 't';
163 |                     pos += 2;
164 |                     break;
165 |                 }
166 | 
167 | 
168 |                 default:
169 |                 {
170 |                     if (c >= 0x00 && c <= 0x1f)
171 |                     {
172 |                         // print character c as \uxxxx
173 |                         snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c));
174 |                         pos += 6;
175 |                         // overwrite trailing null character
176 |                         result[pos] = '\\';
177 |                     }
178 |                     else
179 |                     {
180 |                         // all other characters are added as-is
181 |                         result[pos++] = c;
182 |                     }
183 |                     break;
184 |                 }
185 |                 }
186 |             }
187 | 
188 |             return result;
189 |         }
190 |     }
191 | 
192 |     /** Convert a CSV row to a JSON object, i.e.
193 |      *  `{"col1":"value1","col2":"value2"}`
194 |      *
195 |      *  @note All strings are properly escaped. Numeric values are not quoted.
196 |      *  @param[in] subset A subset of columns to contain in the JSON.
197 |      *                    Leave empty for original columns.
198 |      */
199 |     CSV_INLINE std::string CSVRow::to_json(const std::vector<std::string>& subset) const {
200 |         std::vector<std::string> col_names = subset;
201 |         if (subset.empty()) {
202 |             col_names = this->data ? this->get_col_names() : std::vector<std::string>({});
203 |         }
204 | 
205 |         const size_t _n_cols = col_names.size();
206 |         std::string ret = "{";
207 |         
208 |         for (size_t i = 0; i < _n_cols; i++) {
209 |             auto& col = col_names[i];
210 |             auto field = this->operator[](col);
211 | 
212 |             // TODO: Possible performance enhancements by caching escaped column names
213 |             ret += '"' + internals::json_escape_string(col) + "\":";
214 | 
215 |             // Add quotes around strings but not numbers
216 |             if (field.is_num())
217 |                  ret += internals::json_escape_string(field.get<csv::string_view>());
218 |             else
219 |                 ret += '"' + internals::json_escape_string(field.get<csv::string_view>()) + '"';
220 | 
221 |             // Do not add comma after last string
222 |             if (i + 1 < _n_cols)
223 |                 ret += ',';
224 |         }
225 | 
226 |         ret += '}';
227 |         return ret;
228 |     }
229 | 
230 |     /** Convert a CSV row to a JSON array, i.e.
231 |      *  `["value1","value2",...]`
232 |      *
233 |      *  @note All strings are properly escaped. Numeric values are not quoted.
234 |      *  @param[in] subset A subset of columns to contain in the JSON.
235 |      *                    Leave empty for all columns.
236 |      */
237 |     CSV_INLINE std::string CSVRow::to_json_array(const std::vector<std::string>& subset) const {
238 |         std::vector<std::string> col_names = subset;
239 |         if (subset.empty())
240 |             col_names = this->data ? this->get_col_names() : std::vector<std::string>({});
241 | 
242 |         const size_t _n_cols = col_names.size();
243 |         std::string ret = "[";
244 | 
245 |         for (size_t i = 0; i < _n_cols; i++) {
246 |             auto field = this->operator[](col_names[i]);
247 | 
248 |             // Add quotes around strings but not numbers
249 |             if (field.is_num())
250 |                 ret += internals::json_escape_string(field.get<csv::string_view>());
251 |             else
252 |                 ret += '"' + internals::json_escape_string(field.get<csv::string_view>()) + '"';
253 | 
254 |             // Do not add comma after last string
255 |             if (i + 1 < _n_cols)
256 |                 ret += ',';
257 |         }
258 | 
259 |         ret += ']';
260 |         return ret;
261 |     }
262 | }
263 | 


--------------------------------------------------------------------------------
/include/internal/csv_reader.hpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  @brief Defines functionality needed for basic CSV parsing
  3 |  */
  4 | 
  5 | #pragma once
  6 | 
  7 | #include <algorithm>
  8 | #include <deque>
  9 | #include <fstream>
 10 | #include <iterator>
 11 | #include <memory>
 12 | #include <mutex>
 13 | #include <thread>
 14 | #include <sstream>
 15 | #include <string>
 16 | #include <vector>
 17 | 
 18 | #include "../external/mio.hpp"
 19 | #include "basic_csv_parser.hpp"
 20 | #include "common.hpp"
 21 | #include "data_type.hpp"
 22 | #include "csv_format.hpp"
 23 | 
 24 | /** The all encompassing namespace */
 25 | namespace csv {
 26 |     /** Stuff that is generally not of interest to end-users */
 27 |     namespace internals {
 28 |         std::string format_row(const std::vector<std::string>& row, csv::string_view delim = ", ");
 29 | 
 30 |         std::vector<std::string> _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv());
 31 | 
 32 |         struct GuessScore {
 33 |             double score;
 34 |             size_t header;
 35 |         };
 36 | 
 37 |         CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format);
 38 | 
 39 |         CSVGuessResult _guess_format(csv::string_view head, const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
 40 |     }
 41 | 
 42 |     std::vector<std::string> get_col_names(
 43 |         csv::string_view filename,
 44 |         const CSVFormat format = CSVFormat::guess_csv());
 45 | 
 46 |     /** Guess the delimiter used by a delimiter-separated values file */
 47 |     CSVGuessResult guess_format(csv::string_view filename,
 48 |         const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
 49 | 
 50 |     /** @class CSVReader
 51 |      *  @brief Main class for parsing CSVs from files and in-memory sources
 52 |      *
 53 |      *  All rows are compared to the column names for length consistency
 54 |      *  - By default, rows that are too short or too long are dropped
 55 |      *  - Custom behavior can be defined by overriding bad_row_handler in a subclass
 56 |      */
 57 |     class CSVReader {
 58 |     public:
 59 |         /**
 60 |          * An input iterator capable of handling large files.
 61 |          * @note Created by CSVReader::begin() and CSVReader::end().
 62 |          *
 63 |          * @par Iterating over a file
 64 |          * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 1
 65 |          *
 66 |          * @par Using with `<algorithm>` library
 67 |          * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 2
 68 |          */
 69 |         class iterator {
 70 |         public:
 71 |             #ifndef DOXYGEN_SHOULD_SKIP_THIS
 72 |             using value_type = CSVRow;
 73 |             using difference_type = std::ptrdiff_t;
 74 |             using pointer = CSVRow * ;
 75 |             using reference = CSVRow & ;
 76 |             using iterator_category = std::input_iterator_tag;
 77 |             #endif
 78 | 
 79 |             iterator() = default;
 80 |             iterator(CSVReader* reader) : daddy(reader) {};
 81 |             iterator(CSVReader*, CSVRow&&);
 82 | 
 83 |             /** Access the CSVRow held by the iterator */
 84 |             CONSTEXPR_14 reference operator*() { return this->row; }
 85 |             CONSTEXPR_14 reference operator*() const { return const_cast<reference>(this->row); }
 86 | 
 87 |             /** Return a pointer to the CSVRow the iterator has stopped at */
 88 |             CONSTEXPR_14 pointer operator->() { return &(this->row); }
 89 |             CONSTEXPR_14 pointer operator->() const { return const_cast<pointer>(&(this->row)); }
 90 | 
 91 |             iterator& operator++();   /**< Pre-increment iterator */
 92 |             iterator operator++(int); /**< Post-increment iterator */
 93 | 
 94 |             /** Returns true if iterators were constructed from the same CSVReader
 95 |              *  and point to the same row
 96 |              */
 97 |             CONSTEXPR bool operator==(const iterator& other) const noexcept {
 98 |                 return (this->daddy == other.daddy) && (this->i == other.i);
 99 |             }
100 | 
101 |             CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
102 |         private:
103 |             CSVReader * daddy = nullptr;  // Pointer to parent
104 |             CSVRow row;                   // Current row
105 |             size_t i = 0;               // Index of current row
106 |         };
107 | 
108 |         /** @name Constructors
109 |          *  Constructors for iterating over large files and parsing in-memory sources.
110 |          */
111 |          ///@{
112 |         CSVReader(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv());
113 | 
114 |         /** Allows parsing stream sources such as `std::stringstream` or `std::ifstream`
115 |          *
116 |          *  @tparam TStream An input stream deriving from `std::istream`
117 |          *  @note   Currently this constructor requires special CSV dialects to be manually
118 |          *          specified.
119 |          */
120 |         template<typename TStream,
121 |             csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
122 |         CSVReader(TStream &source, CSVFormat format = CSVFormat::guess_csv()) : _format(format) {
123 |             auto head = internals::get_csv_head(source);
124 |             using Parser = internals::StreamParser<TStream>;
125 | 
126 |             if (format.guess_delim()) {
127 |                 auto guess_result = internals::_guess_format(head, format.possible_delimiters);
128 |                 format.delimiter(guess_result.delim);
129 |                 format.header = guess_result.header_row;
130 |                 this->_format = format;
131 |             }
132 | 
133 |             if (!format.col_names.empty())
134 |                 this->set_col_names(format.col_names);
135 | 
136 |             this->parser = std::unique_ptr<Parser>(
137 |                 new Parser(source, format, col_names)); // For C++11
138 |             this->initial_read();
139 |         }
140 |         ///@}
141 | 
142 |         CSVReader(const CSVReader&) = delete; // No copy constructor
143 |         CSVReader(CSVReader&&) = default;     // Move constructor
144 |         CSVReader& operator=(const CSVReader&) = delete; // No copy assignment
145 |         CSVReader& operator=(CSVReader&& other) = default;
146 |         ~CSVReader() {
147 |             if (this->read_csv_worker.joinable()) {
148 |                 this->read_csv_worker.join();
149 |             }
150 |         }
151 | 
152 |         /** @name Retrieving CSV Rows */
153 |         ///@{
154 |         bool read_row(CSVRow &row);
155 |         iterator begin();
156 |         HEDLEY_CONST iterator end() const noexcept;
157 | 
158 |         /** Returns true if we have reached end of file */
159 |         bool eof() const noexcept { return this->parser->eof(); };
160 |         ///@}
161 | 
162 |         /** @name CSV Metadata */
163 |         ///@{
164 |         CSVFormat get_format() const;
165 |         std::vector<std::string> get_col_names() const;
166 |         int index_of(csv::string_view col_name) const;
167 |         ///@}
168 | 
169 |         /** @name CSV Metadata: Attributes */
170 |         ///@{
171 |         /** Whether or not the file or stream contains valid CSV rows,
172 |          *  not including the header.
173 |          *
174 |          *  @note Gives an accurate answer regardless of when it is called.
175 |          *
176 |          */
177 |         CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; }
178 | 
179 |         /** Retrieves the number of rows that have been read so far */
180 |         CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; }
181 | 
182 |         /** Whether or not CSV was prefixed with a UTF-8 bom */
183 |         bool utf8_bom() const noexcept { return this->parser->utf8_bom(); }
184 |         ///@}
185 | 
186 |     protected:
187 |         /**
188 |          * \defgroup csv_internal CSV Parser Internals
189 |          * @brief Internals of CSVReader. Only maintainers and those looking to
190 |          *        extend the parser should read this.
191 |          * @{
192 |          */
193 | 
194 |         /** Sets this reader's column names and associated data */
195 |         void set_col_names(const std::vector<std::string>&);
196 | 
197 |         /** @name CSV Settings **/
198 |         ///@{
199 |         CSVFormat _format;
200 |         ///@}
201 | 
202 |         /** @name Parser State */
203 |         ///@{
204 |         /** Pointer to a object containing column information */
205 |         internals::ColNamesPtr col_names = std::make_shared<internals::ColNames>();
206 | 
207 |         /** Helper class which actually does the parsing */
208 |         std::unique_ptr<internals::IBasicCSVParser> parser = nullptr;
209 | 
210 |         /** Queue of parsed CSV rows */
211 |         std::unique_ptr<RowCollection> records{new RowCollection(100)};
212 | 
213 |         size_t n_cols = 0;  /**< The number of columns in this CSV */
214 |         size_t _n_rows = 0; /**< How many rows (minus header) have been read so far */
215 | 
216 |         /** @name Multi-Threaded File Reading Functions */
217 |         ///@{
218 |         bool read_csv(size_t bytes = internals::ITERATION_CHUNK_SIZE);
219 |         ///@}
220 | 
221 |         /**@}*/
222 | 
223 |     private:
224 |         /** Whether or not rows before header were trimmed */
225 |         bool header_trimmed = false;
226 | 
227 |         /** @name Multi-Threaded File Reading: Flags and State */
228 |         ///@{
229 |         std::thread read_csv_worker; /**< Worker thread for read_csv() */
230 |         ///@}
231 | 
232 |         /** Read initial chunk to get metadata */
233 |         void initial_read() {
234 |             this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE);
235 |             this->read_csv_worker.join();
236 |         }
237 | 
238 |         void trim_header();
239 |     };
240 | }
241 | 


--------------------------------------------------------------------------------
/include/internal/csv_row.cpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  Defines the data type used for storing information about a CSV row
  3 |  */
  4 | 
  5 | #include <cassert>
  6 | #include <functional>
  7 | #include "csv_row.hpp"
  8 | 
  9 | namespace csv {
 10 |     namespace internals {
 11 |         /** Return the n-th stored field; fields live in pages of _single_buffer_capacity entries (no bounds checking) */
 12 |         CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const {
 13 |             // n % capacity already equals n on the first page, so no special case is needed
 14 |             const size_t page = n / _single_buffer_capacity;
 15 |             const size_t offset = n % _single_buffer_capacity;
 16 |             return this->buffers[page][offset];
 17 |         }
 16 | 
 17 |         /** Append a new page of _single_buffer_capacity fields and make it the current one */
 18 |         CSV_INLINE void CSVFieldList::allocate() {
 19 |             using Page = std::unique_ptr<RawCSVField[]>;
 20 |             buffers.push_back(Page(new RawCSVField[_single_buffer_capacity]));
 21 | 
 22 |             // Nothing has been written to the new page yet
 23 |             _back = buffers.back().get();
 24 |             _current_buffer_size = 0;
 25 |         }
 23 |     }
 24 | 
 25 |     /** Return a CSVField object corresponding to the nth value in the row.
 26 |      *
 27 |      *  @note This method performs bounds checking (through CSVRow::get_field()),
 28 |      *        and will throw an `std::runtime_error` if n is invalid.
 29 |      *
 30 |      *  @param[in] n Zero-based position of the field within this row
 31 |      *
 32 |      *  @complexity
 33 |      *  Constant in the number of fields, by calling csv::CSVRow::get_field()
 34 |      */
 35 |     CSV_INLINE CSVField CSVRow::operator[](size_t n) const {
 36 |         const csv::string_view raw_value = this->get_field(n);
 37 |         return CSVField(raw_value);
 38 |     }
 37 | 
 38 |     /** Retrieve a value by its associated column name. If the column
 39 |      *  specified can't be found, a runtime error is thrown.
 40 |      *
 41 |      *  @complexity
 42 |      *  Constant. This calls the other CSVRow::operator[]() after
 43 |      *  converting column names into indices using a hash table.
 44 |      *
 45 |      *  @param[in] col_name The column to look for
 46 |      */
 47 |     CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const {
 48 |         const auto position = this->data->col_names->index_of(col_name);
 49 |         if (position <= -1) {
 50 |             // index_of() did not report a usable position for this name
 51 |             throw std::runtime_error("Can't find a column named " + col_name);
 52 |         }
 53 | 
 54 |         return this->operator[](position);
 55 |     }
 56 | 
 57 |     /** Convert this row into a vector of strings, one entry per field (in field order) */
 58 |     CSV_INLINE CSVRow::operator std::vector<std::string>() const {
 59 |         const size_t n_fields = this->size();
 60 | 
 61 |         std::vector<std::string> ret;
 62 |         ret.reserve(n_fields);  // one allocation instead of repeated growth while appending
 63 |         for (size_t i = 0; i < n_fields; i++)
 64 |             ret.push_back(std::string(this->get_field(i)));
 65 | 
 66 |         return ret;
 67 |     }
 64 | 
 65 |     CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const
 66 |     {
 67 |         using internals::ParseFlags;
 68 |         // Return the text of the index-th field of this row; throws std::runtime_error when index >= size().
 69 |         if (index >= this->size())
 70 |             throw std::runtime_error("Index out of bounds.");
 71 |         // Locate the field record; field.start is an offset relative to this row's data_start.
 72 |         const size_t field_index = this->fields_start + index;
 73 |         auto& field = this->data->fields[field_index];
 74 |         auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start);
 75 |         // Fields flagged has_double_quote are unescaped once; the result is kept in double_quote_fields and reused.
 76 |         if (field.has_double_quote) {
 77 |             auto& value = this->data->double_quote_fields[field_index];
 78 |             if (value.empty()) { // empty => not unescaped yet
 79 |                 bool prev_ch_quote = false; // NOTE(review): not cleared by non-quote characters, so quotes are dropped alternately rather than strictly pairwise - confirm intended
 80 |                 for (size_t i = 0; i < field.length; i++) {
 81 |                     if (this->data->parse_flags[field_str[i] + CHAR_OFFSET] == ParseFlags::QUOTE) {
 82 |                         if (prev_ch_quote) { // second quote of a pair: drop it
 83 |                             prev_ch_quote = false;
 84 |                             continue;
 85 |                         }
 86 |                         else { // first quote: remember it and copy it through below
 87 |                             prev_ch_quote = true;
 88 |                         }
 89 |                     }
 90 |                     // Every character that was not skipped above is appended to the unescaped value.
 91 |                     value += field_str[i];
 92 |                 }
 93 |             }
 94 |             // The returned view refers to the string stored in double_quote_fields, not to the raw row data.
 95 |             return csv::string_view(value);
 96 |         }
 97 |         // No escaped quotes: return a view directly into the raw data, limited to the field's length.
 98 |         return field_str.substr(0, field.length);
 99 |     }
100 | 
101 |     /** Try to interpret the field as a hexadecimal integer.
102 |      *
103 |      *  Leading spaces are skipped and the number ends at the next space (or at
104 |      *  the end of the field); anything after that space is ignored. Digits may
105 |      *  be upper or lower case; any other character (including an "0x" prefix
106 |      *  or a sign) makes the parse fail.
107 |      *
108 |      *  @param[out] parsedValue Receives the parsed value; left untouched on failure
109 |      *  @return `true` on success; `false` if there are no digits, a non-hex
110 |      *          character is found, or the value does not fit in an `int`
111 |      */
112 |     CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) {
113 |         // Trim out leading spaces, then find the end of the digit run
114 |         size_t start = 0;
115 |         while (start < this->sv.size() && this->sv[start] == ' ')
116 |             start++;
117 | 
118 |         size_t end = start;
119 |         while (end < this->sv.size() && this->sv[end] != ' ')
120 |             end++;
121 | 
122 |         if (end == start) return false;
123 | 
124 |         // Accumulate most-significant digit first in integer arithmetic. This
125 |         // avoids pow() on doubles and lets overflow be rejected before it
126 |         // happens instead of running into undefined behaviour.
127 |         constexpr int max_value = std::numeric_limits<int>::max();
128 |         int value_ = 0;
129 | 
130 |         for (size_t pos = start; pos < end; pos++) {
131 |             const char ch = this->sv[pos];
132 |             int digit = 0;
133 | 
134 |             if (ch >= '0' && ch <= '9') digit = ch - '0';
135 |             else if (ch >= 'a' && ch <= 'f') digit = 10 + (ch - 'a');
136 |             else if (ch >= 'A' && ch <= 'F') digit = 10 + (ch - 'A');
137 |             else return false;
138 | 
139 |             // value_ * 16 + digit would exceed the range of int
140 |             if (value_ > (max_value - digit) / 16) return false;
141 | 
142 |             value_ = value_ * 16 + digit;
143 |         }
144 | 
145 |         parsedValue = value_;
146 |         return true;
147 |     }
166 | 
167 |     /** Try to interpret the field as a number using a custom decimal symbol.
168 |      *
169 |      *  @param[out] dVal          Receives the numeric value on success
170 |      *  @param[in]  decimalSymbol Character passed to the type parser as decimal separator
171 |      *  @return `true` if the field's type is in the range CSV_INT8..CSV_DOUBLE
172 |      */
173 |     CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) {
174 |         // If the field has already been parsed to empty, there is no need to do it again
175 |         if (this->_type == DataType::CSV_NULL)
176 |             return false;
177 | 
178 |         // Not yet parsed, or possibly parsed with another decimalSymbol: parse again.
179 |         // Integral types are not affected by decimalSymbol and keep their cached result.
180 |         switch (this->_type) {
181 |         case DataType::UNKNOWN:
182 |         case DataType::CSV_STRING:
183 |         case DataType::CSV_DOUBLE:
184 |             this->_type = internals::data_type(this->sv, &this->value, decimalSymbol);
185 |             break;
186 |         default:
187 |             break;
188 |         }
189 | 
190 |         // Either an integral type was cached before, or a numeric type was just detected
191 |         const bool is_numeric = this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE;
192 |         if (is_numeric)
193 |             dVal = this->value;
194 | 
195 |         // false here means CSV_NULL or CSV_STRING, i.e. not numeric
196 |         return is_numeric;
197 |     }
187 | 
188 | #ifdef _MSC_VER
189 | #pragma region CSVRow Iterator
190 | #endif
191 |     /** Return an iterator positioned on the first field of this row. */
192 |     CSV_INLINE CSVRow::iterator CSVRow::begin() const {
193 |         const int first_index = 0;
194 |         return CSVRow::iterator(this, first_index);
195 |     }
195 | 
196 |     /** Return an iterator positioned one past the last field of the CSVRow.
197 |      *
198 |      *  @warning Attempting to dereference the end iterator results
199 |      *           in dereferencing a null pointer.
200 |      */
201 |     CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept {
202 |         const int past_last = (int)this->size();
203 |         return CSVRow::iterator(this, past_last);
204 |     }
204 | 
205 |     /** Return a reverse iterator built from end(), i.e. starting at the last field. */
206 |     CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept {
207 |         using reversed = std::reverse_iterator<CSVRow::iterator>;
208 |         return reversed(this->end());
209 |     }
208 | 
209 |     /** Return a reverse iterator built from begin(), marking the end of reverse traversal. */
210 |     CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const {
211 |         using reversed = std::reverse_iterator<CSVRow::iterator>;
212 |         return reversed(this->begin());
213 |     }
212 | 
213 |     /** Build an iterator over `_reader` at index `_i`; no field is held when `_i` is not below the row size. */
214 |     CSV_INLINE HEDLEY_NON_NULL(2)
215 |     CSVRow::iterator::iterator(const CSVRow* _reader, int _i)
216 |         : daddy(_reader), i(_i) {
217 |         const bool in_range = _i < (int)this->daddy->size();
218 |         if (!in_range) {
219 |             this->field = nullptr;
220 |             return;
221 |         }
222 | 
223 |         this->field = std::make_shared<CSVField>((*this->daddy)[_i]);
224 |     }
222 | 
223 |     /** Access the field the iterator currently holds (no null check is performed). */
224 |     CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const {
225 |         CSVField* const current = this->field.get();
226 |         return *current;
227 |     }
226 | 
227 |     /** Return the held field pointer for member access. */
228 |     CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const {
229 |         const auto& current = this->field;
230 |         return current;
231 |     }
230 | 
231 |     CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() {
232 |         // Pre-increment: step forward, then refresh the held field
233 |         ++this->i;
234 | 
235 |         const bool at_end = this->i >= (int)this->daddy->size();
236 |         if (at_end)
237 |             this->field = nullptr; // Reached the end of row
238 |         else
239 |             this->field = std::make_shared<CSVField>((*this->daddy)[this->i]);
240 | 
241 |         return *this;
242 |     }
241 | 
242 |     CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) {
243 |         // Post-increment: advance, but hand back a copy of the previous state
244 |         CSVRow::iterator before(*this);
245 |         ++(*this);
246 |         return before;
247 |     }
248 | 
249 |     CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() {
250 |         // Pre-decrement: step back, then reload the field at the new position
251 |         --this->i;
252 |         this->field = std::make_shared<CSVField>((*this->daddy)[this->i]);
253 | 
254 |         return *this;
255 |     }
256 | 
257 |     CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) {
258 |         // Post-decrement: step back, but hand back a copy of the previous state
259 |         CSVRow::iterator before(*this);
260 |         --(*this);
261 |         return before;
262 |     }
263 |     
264 |     CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const {
265 |         // Iterator arithmetic: a new iterator over the same row, n positions further
266 |         const int target = this->i + (int)n;
267 |         return CSVRow::iterator(this->daddy, target);
268 |     }
268 | 
269 |     CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const {
270 |         // Iterator arithmetic: subtraction is addition of the negated offset
271 |         return *this + (-n);
272 |     }
273 | #ifdef _MSC_VER
274 | #pragma endregion CSVRow Iterator
275 | #endif
276 | }
277 | 


--------------------------------------------------------------------------------
/include/internal/csv_stat.cpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  Calculates statistics from CSV files
  3 |  */
  4 | 
  5 | #include <string>
  6 | #include "csv_stat.hpp"
  7 | 
  8 | namespace csv {
  9 |     /** Calculate statistics for an arbitrarily large file. Constructing a CSVStat
 10 |      *  processes the entire file iteratively; afterwards, methods like get_mean(),
 11 |      *  get_counts(), etc... can be used to retrieve statistics.
 12 |      *
 13 |      *  @param[in] filename Passed to the underlying reader as the file to read
 14 |      *  @param[in] format   Passed to the underlying reader
 15 |      */
 16 |     CSV_INLINE CSVStat::CSVStat(csv::string_view filename, CSVFormat format)
 17 |         : reader(filename, format)
 18 |     {
 19 |         calc();
 20 |     }
 17 | 
 18 |     /** Calculate statistics for a CSV stored in a std::stringstream; the whole stream is processed here */
 19 |     CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format)
 20 |         : reader(stream, format)
 21 |     {
 22 |         calc();
 23 |     }
 23 | 
 24 |     /** Return the current rolling mean of every column, in column order */
 25 |     CSV_INLINE std::vector<long double> CSVStat::get_mean() const {
 26 |         const size_t n_cols = this->get_col_names().size();
 27 | 
 28 |         std::vector<long double> means;
 29 |         for (size_t col = 0; col < n_cols; col++)
 30 |             means.push_back(this->rolling_means[col]);
 31 | 
 32 |         return means;
 33 |     }
 32 | 
 33 |     /** Return the current variance of every column: the accumulated value divided by (n - 1) */
 34 |     CSV_INLINE std::vector<long double> CSVStat::get_variance() const {
 35 |         const size_t n_cols = this->get_col_names().size();
 36 | 
 37 |         std::vector<long double> variances;
 38 |         for (size_t col = 0; col < n_cols; col++) {
 39 |             const auto divisor = this->n[col] - 1;
 40 |             variances.push_back(this->rolling_vars[col] / divisor);
 41 |         }
 42 | 
 43 |         return variances;
 44 |     }
 41 | 
 42 |     /** Return the current minimum of every column, in column order */
 43 |     CSV_INLINE std::vector<long double> CSVStat::get_mins() const {
 44 |         const size_t n_cols = this->get_col_names().size();
 45 | 
 46 |         std::vector<long double> result;
 47 |         for (size_t col = 0; col < n_cols; col++)
 48 |             result.push_back(this->mins[col]);
 49 | 
 50 |         return result;
 51 |     }
 50 | 
 51 |     /** Return the current maximum of every column, in column order */
 52 |     CSV_INLINE std::vector<long double> CSVStat::get_maxes() const {
 53 |         const size_t n_cols = this->get_col_names().size();
 54 | 
 55 |         std::vector<long double> result;
 56 |         for (size_t col = 0; col < n_cols; col++)
 57 |             result.push_back(this->maxes[col]);
 58 | 
 59 |         return result;
 60 |     }
 59 | 
 60 |     /** Get a copy of the value frequency counter of each column, in column order */
 61 |     CSV_INLINE std::vector<CSVStat::FreqCount> CSVStat::get_counts() const {
 62 |         const size_t n_cols = this->get_col_names().size();
 63 | 
 64 |         std::vector<FreqCount> result;
 65 |         for (size_t col = 0; col < n_cols; col++)
 66 |             result.push_back(this->counts[col]);
 67 | 
 68 |         return result;
 69 |     }
 68 | 
 69 |     /** Get a copy of the data type counter of each column, in column order */
 70 |     CSV_INLINE std::vector<CSVStat::TypeCount> CSVStat::get_dtypes() const {
 71 |         const size_t n_cols = this->get_col_names().size();
 72 | 
 73 |         std::vector<TypeCount> result;
 74 |         for (size_t col = 0; col < n_cols; col++)
 75 |             result.push_back(this->dtypes[col]);
 76 | 
 77 |         return result;
 78 |     }
 77 | 
 78 |     /** Fold the rows buffered in `records` into the statistics, then clear the buffer.
 79 |      *
 80 |      *  One worker thread per column is started, and all of them are joined
 81 |      *  before the buffer is cleared.
 82 |      */
 83 |     CSV_INLINE void CSVStat::calc_chunk() {
 84 |         const size_t n_cols = this->get_col_names().size();
 85 | 
 86 |         // Only create stats counters the first time
 87 |         if (dtypes.empty()) {
 88 |             for (size_t col = 0; col < n_cols; col++) {
 89 |                 dtypes.push_back({});
 90 |                 counts.push_back({});
 91 |                 rolling_means.push_back(0);
 92 |                 rolling_vars.push_back(0);
 93 |                 mins.push_back(NAN);
 94 |                 maxes.push_back(NAN);
 95 |                 n.push_back(0);
 96 |             }
 97 |         }
 98 | 
 99 |         // Start one worker per column
100 |         std::vector<std::thread> workers;
101 |         for (size_t col = 0; col < n_cols; col++)
102 |             workers.push_back(std::thread(&CSVStat::calc_worker, this, col));
103 | 
104 |         // Block until every column has been processed
105 |         for (auto& worker : workers)
106 |             worker.join();
107 | 
108 |         this->records.clear();
109 |     }
104 | 
105 |     /** Read every row from the reader and update the statistics in chunks of
106 |      *  CALC_CHUNK_SIZE rows, so that only one chunk is buffered at a time.
107 |      */
108 |     CSV_INLINE void CSVStat::calc() {
109 |         constexpr size_t CALC_CHUNK_SIZE = 5000;
110 | 
111 |         for (auto& row : reader) {
112 |             this->records.push_back(std::move(row));
113 | 
114 |             /** Chunk rows */
115 |             if (this->records.size() == CALC_CHUNK_SIZE) {
116 |                 calc_chunk();
117 |             }
118 |         }
119 | 
120 |         // Flush the final partial chunk. Also run calc_chunk() once when no chunk
121 |         // has been processed at all (e.g. no data rows): it creates the per-column
122 |         // counters, which get_mean(), get_mins(), etc. index without checking.
123 |         if (!this->records.empty() || this->dtypes.empty()) {
124 |             calc_chunk();
125 |         }
126 |     }
121 | 
122 |     CSV_INLINE void CSVStat::calc_worker(const size_t &i) {
123 |         /** Worker thread for CSVStat::calc() which calculates statistics for one column.
124 |          *
125 |          *  @param[in] i Column index
126 |          *
127 |          *  NOTE(review): calc_chunk() runs this function on a std::thread, so the
128 |          *  exception thrown below cannot be caught by the caller of calc() - confirm intended.
129 |          */
130 | 
131 |         const size_t expected_cols = this->get_col_names().size();
132 |         size_t processed = 0;
133 | 
134 |         for (auto& record : this->records) {
135 |             if (record.size() != expected_cols) {
136 |                 // Rows of unexpected length are skipped unless the format asks for an error
137 |                 if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) {
138 |                     throw std::runtime_error("Line has different length than the others " + internals::format_row(record));
139 |                 }
140 |             }
141 |             else {
142 |                 auto current_field = record[i];
143 | 
144 |                 // Optimization: Don't count() if there's too many distinct values in the first 1000 rows
145 |                 const bool keep_counting = processed < 1000 || this->counts[i].size() <= 500;
146 |                 if (keep_counting)
147 |                     this->count(current_field, i);
148 | 
149 |                 this->dtype(current_field, i);
150 | 
151 |                 // Numeric Stuff
152 |                 if (current_field.is_num()) {
153 |                     const long double x_n = current_field.get<long double>();
154 | 
155 |                     // This actually calculates mean AND variance
156 |                     this->variance(x_n, i);
157 |                     this->min_max(x_n, i);
158 |                 }
159 |             }
160 | 
161 |             processed++;
162 |         }
163 |     }
156 | 
157 |     CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) {
158 |         /** Given a field, update the type counter of its column
159 |          *  @param[in] data Data observation
160 |          *  @param[in] i    The column index that should be updated
161 |          */
162 | 
163 |         const auto observed = data.type();
164 |         auto& type_counts = this->dtypes[i];
165 | 
166 |         // operator[] creates a missing entry with a count of zero, so a single
167 |         // increment covers both the "first seen" and the "seen before" case
168 |         type_counts[observed]++;
169 |     }
173 | 
174 |     CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) {
175 |         /** Given a field, update the frequency counter of its column
176 |          *  @param[in] data Data observation
177 |          *  @param[in] i    The column index that should be updated
178 |          */
179 | 
180 |         const auto item = data.get<std::string>();
181 |         auto& frequencies = this->counts[i];
182 | 
183 |         // operator[] creates a missing entry with a count of zero, so a single
184 |         // increment covers both the "first seen" and the "seen before" case
185 |         frequencies[item]++;
186 |     }
191 | 
192 |     CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) {
193 |         /** Update current minimum and maximum
194 |          *  @param[in] x_n Data observation
195 |          *  @param[in] i   The column index that should be updated
196 |          */
197 |         auto& lowest = this->mins[i];
198 |         auto& highest = this->maxes[i];
199 | 
200 |         // NaN marks "no observation yet": the first value seeds both bounds
201 |         if (std::isnan(lowest))
202 |             lowest = x_n;
203 |         if (std::isnan(highest))
204 |             highest = x_n;
205 | 
206 |         if (x_n < lowest)
207 |             lowest = x_n;
208 |         else if (x_n > highest)
209 |             highest = x_n;
210 |     }
207 | 
208 |     CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) {
209 |         /** Given an observation, update the rolling mean and variance accumulator
210 |          *  of one column using Welford's Algorithm
211 |          *  @param[in] x_n Data observation
212 |          *  @param[in] i   The column index that should be updated
213 |          */
214 |         long double& mean = this->rolling_means[i];
215 |         long double& sum_of_squares = this->rolling_vars[i];
216 |         long double& observations = this->n[i];
217 | 
218 |         observations++;
219 | 
220 |         // The first observation only seeds the mean
221 |         if (observations == 1) {
222 |             mean = x_n;
223 |             return;
224 |         }
225 | 
226 |         const long double delta = x_n - mean;       // distance to the old mean
227 |         mean += delta/observations;
228 |         const long double delta2 = x_n - mean;      // distance to the updated mean
229 |         sum_of_squares += delta*delta2;
230 |     }
231 | 
232 |     /** Useful for uploading CSV files to SQL databases.
233 |      *
234 |      *  Return a data type for each column such that every value in a column can be
235 |      *  converted to the corresponding data type without data loss: CSV_STRING if
236 |      *  any string was seen, otherwise CSV_DOUBLE if any floating point value was
237 |      *  seen, otherwise the widest integer type observed. Columns in which none of
238 |      *  these types was observed are reported as CSV_DOUBLE.
239 |      *
240 |      *  @param[in]  filename The CSV file
241 |      *
242 |      *  \return A mapping of column names to csv::DataType enums
243 |      */
244 |     CSV_INLINE std::unordered_map<std::string, DataType> csv_data_types(const std::string& filename) {
245 |         CSVStat stat(filename);
246 |         std::unordered_map<std::string, DataType> csv_dtypes;
247 | 
248 |         const auto col_names = stat.get_col_names();
249 |         auto type_counts = stat.get_dtypes();
250 | 
251 |         // Candidates from widest to narrowest; the first one observed wins.
252 |         // CSV_DOUBLE must come before the integer types, otherwise a column mixing
253 |         // integers and decimals would be reported as an integer column.
254 |         const DataType widest_first[] = {
255 |             DataType::CSV_STRING,
256 |             DataType::CSV_DOUBLE,
257 |             DataType::CSV_INT64,
258 |             DataType::CSV_INT32,
259 |             DataType::CSV_INT16,
260 |             DataType::CSV_INT8
261 |         };
262 | 
263 |         for (size_t i = 0; i < col_names.size(); i++) {
264 |             auto& col = type_counts[i];
265 | 
266 |             // Fallback when none of the candidates was observed
267 |             DataType chosen = DataType::CSV_DOUBLE;
268 |             for (const DataType candidate : widest_first) {
269 |                 if (col[candidate]) {
270 |                     chosen = candidate;
271 |                     break;
272 |                 }
273 |             }
274 | 
275 |             csv_dtypes[col_names[i]] = chosen;
276 |         }
277 | 
278 |         return csv_dtypes;
279 |     }
267 | }


--------------------------------------------------------------------------------
/include/internal/basic_csv_parser.cpp:
--------------------------------------------------------------------------------
  1 | #include "basic_csv_parser.hpp"
  2 | 
  3 | namespace csv {
  4 |     namespace internals {
  5 |         /** Return the distance between the initial and the end position of `filename` opened as a binary stream */
  6 |         CSV_INLINE size_t get_file_size(csv::string_view filename) {
  7 |             const std::string path(filename);
  8 |             std::ifstream stream(path, std::ios::binary);
  9 | 
 10 |             const auto first_pos = stream.tellg();
 11 |             stream.seekg(0, std::ios::end);
 12 |             const auto last_pos = stream.tellg();
 13 | 
 14 |             return last_pos - first_pos;
 15 |         }
 13 | 
 14 |         /** Return the head of `filename`, looking up the file size first */
 15 |         CSV_INLINE std::string get_csv_head(csv::string_view filename) {
 16 |             const size_t file_size = get_file_size(filename);
 17 |             return get_csv_head(filename, file_size);
 18 |         }
 17 | 
 18 |         /** Return the first bytes of `filename` (at most 500000, and at most `file_size`) as a string.
 19 |          *
 20 |          *  @throws std::runtime_error if memory-mapping the file reports an error
 21 |          */
 22 |         CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) {
 23 |             const size_t max_head_bytes = 500000;
 24 |             const size_t length = (file_size < max_head_bytes) ? file_size : max_head_bytes;
 25 | 
 26 |             std::error_code error;
 27 |             auto head_map = mio::make_mmap_source(std::string(filename), 0, length, error);
 28 |             if (error) {
 29 |                 throw std::runtime_error("Cannot open file " + std::string(filename));
 30 |             }
 31 | 
 32 |             return std::string(head_map.begin(), head_map.end());
 33 |         }
 31 | 
 32 | #ifdef _MSC_VER
 33 | #pragma region IBasicCVParser
 34 | #endif
 35 |         /** Build the character lookup tables (parse flags and whitespace flags) from `format` */
 36 |         CSV_INLINE IBasicCSVParser::IBasicCSVParser(
 37 |             const CSVFormat& format,
 38 |             const ColNamesPtr& col_names
 39 |         ) : _col_names(col_names) {
 40 |             // The quote character only takes part in parsing when quoting is enabled
 41 |             if (!format.no_quote) {
 42 |                 _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char);
 43 |             }
 44 |             else {
 45 |                 _parse_flags = internals::make_parse_flags(format.get_delim());
 46 |             }
 47 | 
 48 |             const auto& trim = format.trim_chars;
 49 |             _ws_flags = internals::make_ws_flags(trim.data(), trim.size());
 50 |         }
 50 | 
 51 |         CSV_INLINE void IBasicCSVParser::end_feed() {
 52 |             using internals::ParseFlags;
 53 |             // Flush the field and the row that are still being built (MmapParser::next() calls this once it reaches EOF).
 54 |             bool empty_last_field = this->data_ptr
 55 |                 && this->data_ptr->_data
 56 |                 && !this->data_ptr->data.empty()
 57 |                 && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER
 58 |                     || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE);
 59 |             // Data ending in a delimiter or quote has a zero-length last field, which must still be pushed.
 60 |             // Push field
 61 |             if (this->field_length > 0 || empty_last_field) {
 62 |                 this->push_field();
 63 |             }
 64 |             // A row without any field is not emitted.
 65 |             // Push row
 66 |             if (this->current_row.size() > 0)
 67 |                 this->push_row();
 68 |         }
 69 | 
 70 |         CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
 71 |             using internals::ParseFlags;
 72 |             auto& in = this->data_ptr->data;
 73 |             // Consume a run of non-special characters starting at data_pos and update field_start/field_length.
 74 |             // Trim off leading whitespace
 75 |             while (data_pos < in.size() && ws_flag(in[data_pos]))
 76 |                 data_pos++;
 77 |             // field_start is stored relative to the start of the current row; it is only set once per field.
 78 |             if (field_start == UNINITIALIZED_FIELD)
 79 |                 field_start = (int)(data_pos - current_row_start());
 80 | 
 81 |             // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
 82 |             // sequences, use the loop below to avoid having to go through the outer
 83 |             // switch statement as much as possible
 84 |             while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL)
 85 |                 data_pos++;
 86 |             // The length spans from the field's absolute start up to (not including) data_pos.
 87 |             field_length = data_pos - (field_start + current_row_start());
 88 | 
 89 |             // Trim off trailing whitespace, this->field_length constraint matters
 90 |             // when field is entirely whitespace
 91 |             for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) // walks backwards from the last consumed character
 92 |                 this->field_length--;
 93 |         }
 94 | 
 95 |         /** Store the field that was just parsed and reset the per-field parser state. */
 96 |         CSV_INLINE void IBasicCSVParser::push_field()
 97 |         {
 98 |             // A field whose start was never set is stored with offset 0
 99 |             const unsigned int start = (field_start == UNINITIALIZED_FIELD) ? 0 : (unsigned int)field_start;
100 | 
101 |             if (!field_has_double_quote) {
102 |                 fields->emplace_back(start, field_length);
103 |             }
104 |             else {
105 |                 // Remember that this field needs unescaping, then clear the flag for the next field
106 |                 fields->emplace_back(start, field_length, true);
107 |                 field_has_double_quote = false;
108 |             }
109 | 
110 |             current_row.row_length++;
111 | 
112 |             // Reset field state
113 |             field_length = 0;
114 |             field_start = UNINITIALIZED_FIELD;
115 |         }
120 | 
121 |         /** Split the current chunk into fields and rows. @return The number of characters parsed that belong to complete rows */
122 |         CSV_INLINE size_t IBasicCSVParser::parse()
123 |         {
124 |             using internals::ParseFlags;
125 |             // Every call starts at the beginning of the chunk held by data_ptr; trim_utf8_bom() may advance data_pos past a BOM.
126 |             this->quote_escape = false;
127 |             this->data_pos = 0;
128 |             this->current_row_start() = 0;
129 |             this->trim_utf8_bom();
130 |             // Dispatch on the class of the current character (presumably compound_parse_flag() also accounts for quote_escape - see its definition).
131 |             auto& in = this->data_ptr->data;
132 |             while (this->data_pos < in.size()) {
133 |                 switch (compound_parse_flag(in[this->data_pos])) {
134 |                 case ParseFlags::DELIMITER: // end of the current field
135 |                     this->push_field();
136 |                     this->data_pos++;
137 |                     break;
138 | 
139 |                 case ParseFlags::NEWLINE:
140 |                     this->data_pos++;
141 | 
142 |                     // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines)
143 |                     while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE)
144 |                         this->data_pos++;
145 | 
146 |                     // End of record -> Write record
147 |                     this->push_field();
148 |                     this->push_row();
149 | 
150 |                     // Reset: the next row is constructed from the current position and the current number of stored fields
151 |                     this->current_row = CSVRow(data_ptr, this->data_pos, fields->size());
152 |                     break;
153 | 
154 |                 case ParseFlags::NOT_SPECIAL:
155 |                     this->parse_field();
156 |                     break;
157 | 
158 |                 case ParseFlags::QUOTE_ESCAPE_QUOTE:
159 |                     if (data_pos + 1 == in.size()) return this->current_row_start(); // quote is the last character of the chunk: stop and report only what precedes the current row
160 |                     else if (data_pos + 1 < in.size()) {
161 |                         auto next_ch = parse_flag(in[data_pos + 1]);
162 |                         if (next_ch >= ParseFlags::DELIMITER) { // NOTE(review): relies on the numeric ordering of ParseFlags - confirm against the enum
163 |                             quote_escape = false;
164 |                             data_pos++;
165 |                             break;
166 |                         }
167 |                         else if (next_ch == ParseFlags::QUOTE) {
168 |                             // Case: Escaped quote
169 |                             data_pos += 2;
170 |                             this->field_length += 2;
171 |                             this->field_has_double_quote = true;
172 |                             break;
173 |                         }
174 |                     }
175 |                     
176 |                     // Case: Unescaped single quote => not strictly valid but we'll keep it
177 |                     this->field_length++;
178 |                     data_pos++;
179 | 
180 |                     break;
181 | 
182 |                 default: // Quote (currently not quote escaped)
183 |                     if (this->field_length == 0) { // quote before any field content: switch on quote_escape
184 |                         quote_escape = true;
185 |                         data_pos++;
186 |                         if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos]))
187 |                             field_start = (int)(data_pos - current_row_start());
188 |                         break;
189 |                     }
190 | 
191 |                     // Case: Unescaped quote
192 |                     this->field_length++;
193 |                     data_pos++;
194 | 
195 |                     break;
196 |                 }
197 |             }
198 |             // Only the characters before the start of the current (not yet pushed) row are reported.
199 |             return this->current_row_start();
200 |         }
201 | 
202 |         /** Finalize the length of the current row and move it into the record queue. */
203 |         CSV_INLINE void IBasicCSVParser::push_row() {
204 |             const auto total_fields = fields->size();
205 |             current_row.row_length = total_fields - current_row.fields_start;
206 | 
207 |             this->_records->push_back(std::move(current_row));
208 |         }
206 | 
207 |         /** Start a fresh RawCSVData that shares this parser's parse flags and column names. */
208 |         CSV_INLINE void IBasicCSVParser::reset_data_ptr() {
209 |             this->data_ptr = std::make_shared<RawCSVData>();
210 | 
211 |             auto& fresh = *this->data_ptr;
212 |             fresh.parse_flags = this->_parse_flags;
213 |             fresh.col_names = this->_col_names;
214 | 
215 |             // Fields parsed from now on are appended to the new object's list
216 |             this->fields = &(fresh.fields);
217 |         }
213 | 
214 |         /** Skip a UTF-8 byte order mark at the start of the data. The check runs
215 |          *  only until it has been performed once on data of at least 3 bytes.
216 |          */
217 |         CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
218 |             auto& data = this->data_ptr->data;
219 | 
220 |             if (this->unicode_bom_scan || data.size() < 3)
221 |                 return;
222 | 
223 |             const bool has_bom = data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF';
224 |             if (has_bom) {
225 |                 this->data_pos += 3; // Remove BOM from input string
226 |                 this->_utf8_bom = true;
227 |             }
228 | 
229 |             this->unicode_bom_scan = true;
230 |         }
226 | #ifdef _MSC_VER
227 | #pragma endregion
228 | #endif
229 | 
230 | #ifdef _MSC_VER
231 | #pragma region Specializations
232 | #endif
233 |         CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) {
234 |             // Map and parse the next chunk (at most `bytes` bytes) of the file. First: reset parser state
235 |             this->field_start = UNINITIALIZED_FIELD;
236 |             this->field_length = 0;
237 |             this->reset_data_ptr();
238 | 
239 |             // Create memory map starting at mmap_pos, clamped to the bytes left in the source
240 |             size_t length = std::min(this->source_size - this->mmap_pos, bytes);
241 |             std::error_code error;
242 |             this->data_ptr->_data = std::make_shared<mio::basic_mmap_source<char>>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error));
243 |             this->mmap_pos += length;
244 |             if (error) throw error; // NOTE(review): throws a std::error_code object, not a std::exception-derived type
245 |             // _data owns the mapping; it is cast back to the concrete mmap type to read it (presumably stored type-erased - confirm in RawCSVData)
246 |             auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr->_data.get());
247 | 
248 |             // Create string view
249 |             this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length());
250 | 
251 |             // Parse
252 |             this->current_row = CSVRow(this->data_ptr);
253 |             size_t remainder = this->parse();            
254 |             // At the end of the source (or when no_chunk() holds) flush whatever field/row is still pending
255 |             if (this->mmap_pos == this->source_size || no_chunk()) {
256 |                 this->_eof = true;
257 |                 this->end_feed();
258 |             }
259 |             // parse() reported how many bytes belong to complete rows; rewind by the rest so the next call re-reads it
260 |             this->mmap_pos -= (length - remainder);
261 |         }
262 | #ifdef _MSC_VER
263 | #pragma endregion
264 | #endif
265 |     }
266 | }
267 | 


--------------------------------------------------------------------------------
/include/internal/csv_reader.cpp:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  *  @brief Defines functionality needed for basic CSV parsing
  3 |  */
  4 | 
  5 | #include "csv_reader.hpp"
  6 | 
  7 | namespace csv {
  8 |     namespace internals {
  9 |         CSV_INLINE std::string format_row(const std::vector<std::string>& row, csv::string_view delim) {
 10 |             /** Join the fields of `row` with `delim` and end the line with '\n' (an empty row yields "") */
 11 |             std::stringstream line;
 12 |             const size_t n_fields = row.size();
 13 |             for (size_t pos = 0; pos < n_fields; ++pos) {
 14 |                 line << row[pos];
 15 |                 // Delimiter between fields, newline after the last one
 16 |                 if (pos + 1 == n_fields) line << '\n';
 17 |                 else line << delim;
 18 |             }
 19 |             return line.str();
 20 |         }
 21 | 
 22 |         /** Return a CSV's column names
 23 |          *
 24 |          *  @param[in] head    CSV text to parse; it must contain the header row
 25 |          *  @param[in] format  Format of the CSV file
 26 |          *  @return Fields of row `format.get_header()`, or an empty vector if `head` has no such row
 27 |          */
 28 |         CSV_INLINE std::vector<std::string> _get_col_names(csv::string_view head, CSVFormat format) {
 29 |             // Copy exactly head.size() bytes: a string_view's data() is not guaranteed to be null-terminated
 30 |             std::stringstream source(std::string(head.data(), head.size()));
 31 |             RowCollection rows;
 32 | 
 33 |             StreamParser<std::stringstream> parser(source, format);
 34 |             parser.set_output(rows);
 35 |             parser.next();
 36 |             const int header_row = format.get_header();
 37 |             if (header_row < 0 || static_cast<size_t>(header_row) >= rows.size()) return {}; // no such row: avoid indexing out of range
 38 |             return CSVRow(std::move(rows[header_row]));
 39 |         }
 40 | 
 41 |         CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) {
 42 |             /** Score how well `format` splits `head` into rows of a consistent length.
 43 |              *
 44 |              *  Every non-empty row length is scored as (length x number of rows of that length);
 45 |              *  the highest such product is returned together with the index of the first row
 46 |              *  that had the winning length (the header-row guess).
 47 |              */
 48 | 
 49 |             // Frequency counter of row length
 50 |             std::unordered_map<size_t, size_t> row_tally = { { 0, 0 } };
 51 | 
 52 |             // Map row lengths to row num where they first occurred
 53 |             std::unordered_map<size_t, size_t> row_when = { { 0, 0 } };
 54 | 
 55 |             // Parse the CSV. Copy exactly head.size() bytes: data() is not guaranteed to be null-terminated
 56 |             std::stringstream source(std::string(head.data(), head.size()));
 57 |             RowCollection rows;
 58 | 
 59 |             StreamParser<std::stringstream> parser(source, format);
 60 |             parser.set_output(rows);
 61 |             parser.next();
 62 | 
 63 |             for (size_t i = 0; i < rows.size(); i++) {
 64 |                 const size_t length = rows[i].size();
 65 | 
 66 |                 // Ignore zero-length rows
 67 |                 if (length == 0) continue;
 68 | 
 69 |                 // operator[] starts unseen lengths at 0, so a count of 1 marks the first occurrence
 70 |                 if (++row_tally[length] == 1) row_when[length] = i;
 71 |             }
 72 | 
 73 |             double final_score = 0;
 74 |             size_t header_row = 0;
 75 | 
 76 |             // Final score is equal to the largest
 77 |             // row size times rows of that size
 78 |             for (const auto& pair : row_tally) {
 79 |                 const double score = static_cast<double>(pair.first * pair.second);
 80 |                 if (score > final_score) {
 81 |                     final_score = score;
 82 |                     header_row = row_when[pair.first];
 83 |                 }
 84 |             }
 85 | 
 86 |             return {
 87 |                 final_score,
 88 |                 header_row
 89 |             };
 90 |         }
 91 | 
 92 |         /** Guess the delimiter used by a delimiter-separated values file */
 93 |         CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector<char>& delims) {
 94 |             /** Each candidate delimiter is scored with calculate_score(). The candidate
 95 |              *  with the strictly highest score (truncated to an integer) wins, so ties
 96 |              *  keep the earlier candidate. The header row is the one calculate_score()
 97 |              *  reported for the winner. If nothing scores above zero: delims[0], row 0.
 98 |              */
 99 | 
100 |             CSVFormat trial_format;
101 |             char best_delim = delims[0];
102 |             size_t best_score = 0;
103 |             size_t best_header = 0;
104 | 
105 |             for (char candidate : delims) {
106 |                 auto outcome = calculate_score(head, trial_format.delimiter(candidate));
107 |                 const size_t score = (size_t)outcome.score;
108 |                 if (score <= best_score) continue;
109 | 
110 |                 best_score = score;
111 |                 best_delim = candidate;
112 |                 best_header = outcome.header;
113 |             }
114 | 
115 |             return { best_delim, (int)best_header };
116 |         }
117 |     }
118 | 
119 |     /** Return a CSV's column names
120 |      *
121 |      *  @param[in] filename  Path to CSV file
122 |      *  @param[in] format    Format of the CSV file; delimiter and header row are guessed when format.guess_delim() is true
123 |      *  @return The column names, as produced by internals::_get_col_names()
124 |      */
125 |     CSV_INLINE std::vector<std::string> get_col_names(csv::string_view filename, CSVFormat format) {
126 |         auto head = internals::get_csv_head(filename);
127 | 
128 |         /** Guess delimiter and header row from the head read above (guess_format() would read the file a second time) */
129 |         if (format.guess_delim()) {
130 |             auto guess_result = internals::_guess_format(head, format.get_possible_delims());
131 |             format.delimiter(guess_result.delim).header_row(guess_result.header_row);
132 |         }
133 | 
134 |         return internals::_get_col_names(head, format);
135 |     }
136 | 
137 |     /** Guess the delimiter used by a delimiter-separated values file */
138 |     CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector<char>& delims) {
139 |         // The guess is made from whatever internals::get_csv_head() returns for `filename`
140 |         return internals::_guess_format(internals::get_csv_head(filename), delims);
141 |     }
142 | 
143 |     /** Reads an arbitrarily large CSV file using memory-mapped IO.
144 |      *
145 |      *  **Details:** Reads the first block of a CSV file synchronously to get information
146 |      *               such as column names and delimiting character.
147 |      *
148 |      *  @param[in] filename  Path to CSV file
149 |      *  @param[in] format    Format of the CSV file
150 |      *
151 |      *  \snippet tests/test_read_csv.cpp CSVField Example
152 |      *
153 |      */
154 |     CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) {
155 |         const auto file_head = internals::get_csv_head(filename);
156 | 
157 |         // When requested, infer delimiter and header row from the head and store the updated format
158 |         if (format.guess_delim()) {
159 |             const auto guessed = internals::_guess_format(file_head, format.possible_delimiters);
160 |             format.delimiter(guessed.delim);
161 |             format.header = guessed.header_row;
162 |             this->_format = format;
163 |         }
164 | 
165 |         if (!format.col_names.empty()) {
166 |             this->set_col_names(format.col_names);
167 |         }
168 | 
169 |         this->parser = std::unique_ptr<internals::MmapParser>(new internals::MmapParser(filename, format, this->col_names)); // For C++11
170 |         this->initial_read();
171 |     }
172 | 
173 |     /** Return the format of the original raw CSV */
174 |     CSV_INLINE CSVFormat CSVReader::get_format() const {
175 |         CSVFormat snapshot(this->_format);
176 | 
177 |         // Since users are normally not allowed to set
178 |         // column names and header row simultaneously,
179 |         // the backing variables are assigned directly here
180 |         snapshot.header = this->_format.header;
181 |         snapshot.col_names = this->col_names->get_col_names();
182 | 
183 |         return snapshot;
184 |     }
185 | 
186 |     /** Return the CSV's column names as a vector of strings. */
187 |     CSV_INLINE std::vector<std::string> CSVReader::get_col_names() const {
188 |         // A null `col_names` pointer means there are no names to report
189 |         if (!this->col_names) {
190 |             return std::vector<std::string>();
191 |         }
192 |         return this->col_names->get_col_names();
193 |     }
194 | 
195 |     /** Return the position of the first column named `col_name`,
196 |      *         or csv::CSV_NOT_FOUND when no column has that name.
197 |      */
198 |     CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const {
199 |         const auto names = this->get_col_names();
200 |         for (size_t pos = 0; pos < names.size(); ++pos) {
201 |             if (names[pos] == col_name) return (int)pos;
202 |         }
203 |         return CSV_NOT_FOUND;
204 |     }
205 | 
206 |     CSV_INLINE void CSVReader::trim_header() {
207 |         if (this->header_trimmed) return; // already done
208 | 
209 |         // Pop rows 0..header; the header row supplies the column names unless names are already set
210 |         for (int row_no = 0; row_no <= this->_format.header && !this->records->empty(); row_no++) {
211 |             const bool is_header = (row_no == this->_format.header);
212 |             if (is_header && this->col_names->empty())
213 |                 this->set_col_names(this->records->pop_front());
214 |             else
215 |                 this->records->pop_front();
216 |         }
217 | 
218 |         this->header_trimmed = true;
219 |     }
220 | 
221 |     /** Replace the reader's column names and set `n_cols` to the number of names.
222 |      *  @param[in] names Column names
223 |      */
224 |     CSV_INLINE void CSVReader::set_col_names(const std::vector<std::string>& names) {
225 |         // Update the column-name object first, then the cached column count
226 |         this->col_names->set_col_names(names);
227 |         this->n_cols = names.size();
228 |     }
229 | 
230 |     /**
231 |      * Read a chunk of CSV data.
232 |      *
233 |      * @note This method is meant to be run on its own thread. Only one `read_csv()` thread
234 |      *       should be active at a time.
235 |      *
236 |      * @param[in] bytes Number of bytes to read.
237 |      * @return Always `true`.
238 |      * @see CSVReader::read_csv_worker
239 |      * @see CSVReader::read_row()
240 |      */
241 |     CSV_INLINE bool CSVReader::read_csv(size_t bytes) {
242 |         // Tell read_row() to listen for CSV rows
243 |         this->records->notify_all();
244 |         // Point the parser's output at `records`, then parse the next chunk
245 |         this->parser->set_output(*this->records);
246 |         this->parser->next(bytes);
247 |         // trim_header() re-checks `header_trimmed` itself, so the guard here is only a shortcut
248 |         if (!this->header_trimmed) {
249 |             this->trim_header();
250 |         }
251 | 
252 |         // Tell read_row() to stop waiting
253 |         this->records->kill_all();
254 | 
255 |         return true;
256 |     }
257 | 
258 |     /**
259 |      * Retrieve the next row: returns true after storing a row in `row`, false at end of file with no rows left.
260 |      *
261 |      * @par Performance Notes
262 |      *  - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time
263 |      *  - For performance details, read the documentation for CSVRow and CSVField.
264 |      *
265 |      * @param[out] row The variable where the parsed row will be stored
266 |      * @see CSVRow, CSVField
267 |      * @throws std::runtime_error for a row whose size differs from `n_cols` under VariableColumnPolicy::THROW
268 |      * **Example:**
269 |      * \snippet tests/test_read_csv.cpp CSVField Example
270 |      *
271 |      */
272 |     CSV_INLINE bool CSVReader::read_row(CSVRow &row) {
273 |         while (true) { // every path out of this loop is a return or a throw
274 |             if (this->records->empty()) {
275 |                 if (this->records->is_waitable())
276 |                     // Reading thread is currently active => wait for it to populate records
277 |                     this->records->wait();
278 |                 else if (this->parser->eof())
279 |                     // End of file and no more records
280 |                     return false;
281 |                 else {
282 |                     // Reading thread is not active => start another one
283 |                     if (this->read_csv_worker.joinable())
284 |                         this->read_csv_worker.join();
285 |                     // The previous worker is joined above before its std::thread object is reassigned
286 |                     this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE);
287 |                 }
288 |             }
289 |             else if (this->records->front().size() != this->n_cols &&
290 |                 this->_format.variable_column_policy != VariableColumnPolicy::KEEP) {
291 |                 auto errored_row = this->records->pop_front(); // size mismatch and policy is not KEEP: remove the row
292 |                 // Unless the policy is THROW, the removed row is silently dropped and the loop continues
293 |                 if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) {
294 |                     if (errored_row.size() < this->n_cols)
295 |                         throw std::runtime_error("Line too short " + internals::format_row(errored_row));
296 | 
297 |                     throw std::runtime_error("Line too long " + internals::format_row(errored_row));
298 |                 }
299 |             }
300 |             else {
301 |                 row = this->records->pop_front();
302 |                 this->_n_rows++; // counts only rows actually handed to the caller
303 |                 return true;
304 |             }
305 |         }
306 |         // Not reached: the loop above never breaks
307 |         return false;
308 |     }
309 | }
310 | 


--------------------------------------------------------------------------------