├── single_include_test ├── my_header.hpp ├── README.md ├── CMakeLists.txt ├── file1.cpp └── file2.cpp ├── python ├── csvpy │ ├── __init__.py │ └── DictReader.py ├── examples │ ├── PyDemo2.py │ ├── PyDemo7.py │ ├── PyDemo6.py │ ├── PyDemo5.py │ ├── PyDemo1.py │ ├── PyDemo3.py │ └── PyDemo4.py ├── CMakeLists.txt └── csvpy.cpp ├── codecov.yml ├── tests ├── main.cpp ├── test_csv_delimeter.cpp ├── test_guess_csv.cpp ├── CMakeLists.txt ├── shared │ └── float_test_cases.hpp ├── test_csv_ranges.cpp ├── test_round_trip.cpp ├── test_csv_format.cpp ├── test_csv_row.cpp ├── test_csv_field_array.cpp ├── test_csv_stat.cpp ├── test_csv_row_json.cpp ├── test_csv_iterator.cpp ├── test_write_csv.cpp ├── test_data_type.cpp ├── test_read_csv_file.cpp ├── test_csv_field.cpp └── test_raw_csv_data.cpp ├── .gitmodules ├── cpp.hint ├── .gitattributes ├── docs └── source │ ├── variable_row_lengths.md │ ├── scientific_notation.md │ └── Doxy.md ├── programs ├── csv_bench.py ├── csv_info.cpp ├── round_trip.cpp ├── csv_stats.cpp ├── csv_generator.cpp ├── csv_guess_bench.cpp ├── csv_bench.cpp ├── CMakeLists.txt └── data_type_bench.cpp ├── include ├── internal │ ├── CMakeLists.txt │ ├── col_names.cpp │ ├── col_names.hpp │ ├── csv_utility.hpp │ ├── csv_stat.hpp │ ├── csv_reader_iterator.cpp │ ├── csv_utility.cpp │ ├── csv_format.cpp │ ├── csv_format.hpp │ ├── common.hpp │ ├── csv_row_json.cpp │ ├── csv_reader.hpp │ ├── csv_row.cpp │ ├── csv_stat.cpp │ ├── basic_csv_parser.cpp │ └── csv_reader.cpp └── csv.hpp ├── CMakeSettings.json ├── LICENSE ├── .gitignore ├── Makefile ├── .travis.yml ├── .github └── workflows │ └── cmake-multi-platform.yml ├── CMakeLists.txt └── single_header.py /single_include_test/my_header.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "csv.hpp" -------------------------------------------------------------------------------- /python/csvpy/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .csvpy import Reader 2 | from .DictReader import DictReader -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "include/external" 3 | - "tests" 4 | coverage: 5 | status: 6 | project: 7 | default: 8 | target: 95% -------------------------------------------------------------------------------- /tests/main.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | 3 | // For Catch + MSVC 4 | #define _SILENCE_CXX17_UNCAUGHT_EXCEPTION_DEPRECATION_WARNING 5 | 6 | #include -------------------------------------------------------------------------------- /single_include_test/README.md: -------------------------------------------------------------------------------- 1 | # Purpose 2 | 3 | The purpose of this directory is to make sure that the single header 4 | `csv.hpp` file does not cause compile errors when `#include`d from multiple 5 | .cpp files. -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/data"] 2 | path = tests/data 3 | url = https://github.com/vincentlaucsb/csv-data.git 4 | [submodule "python/pybind11"] 5 | path = python/pybind11 6 | url = https://github.com/pybind/pybind11.git 7 | -------------------------------------------------------------------------------- /cpp.hint: -------------------------------------------------------------------------------- 1 | // Hint files help the Visual Studio IDE interpret Visual C++ identifiers 2 | // such as names of functions and macros. 
3 | // For more information see https://go.microsoft.com/fwlink/?linkid=865984 4 | #define CONSTEXPR 5 | -------------------------------------------------------------------------------- /python/examples/PyDemo2.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csvpy 3 | 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv' 5 | reader = csvpy.Reader(str(path)) 6 | 7 | for row in reader: 8 | row['Year'].get_int() 9 | # row[0].get_int() 10 | -------------------------------------------------------------------------------- /python/examples/PyDemo7.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csvpy 3 | 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv' 5 | 6 | info = csvpy.get_file_info(str(path)) 7 | 8 | print(info.filename) 9 | print(info.col_names) 10 | print(info.delim) 11 | print(info.n_rows) 12 | print(info.n_cols) -------------------------------------------------------------------------------- /single_include_test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # The purpose of this executable is to make sure it successfully compiles 2 | add_executable(single_include_test "") 3 | target_sources(single_include_test 4 | PRIVATE 5 | file1.cpp 6 | file2.cpp 7 | ) 8 | target_link_libraries(single_include_test PRIVATE Threads::Threads) 9 | -------------------------------------------------------------------------------- /python/examples/PyDemo6.py: -------------------------------------------------------------------------------- 1 | import csvpy 2 | 3 | format = csvpy.Format().delimiter(',') 4 | reader = csvpy.parse( 5 | 'Name, Age\nHussein Sarea, 22\nMoataz Sarea, 21', 6 | format 7 | ) 8 | # reader = csvpy.parse_no_header( 9 | # 'Name, Age\nHussein Sarea, 22\nMoataz Sarea, 21', 10 | # ) 11 | for r in reader: 12 | 
print(r[1].get_str()) 13 | -------------------------------------------------------------------------------- /python/examples/PyDemo5.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csvpy 3 | 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv' 5 | format = csvpy.Format() 6 | format.delimiter(',').quote('"').header_row(2) 7 | reader = csvpy.Reader(str(path), format) 8 | for row in reader: 9 | # Do stuff with rows here 10 | print(row[1].get_str()) 11 | -------------------------------------------------------------------------------- /python/examples/PyDemo1.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csvpy 3 | 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv' 5 | reader = csvpy.Reader(str(path)) 6 | 7 | for row in reader: 8 | for field in row: 9 | # field.get_int() 10 | # field.get_float() 11 | # field.get_double() 12 | # field.get_sv() 13 | print(field.get_str()) 14 | 15 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /docs/source/variable_row_lengths.md: -------------------------------------------------------------------------------- 1 | # Dealing with Variable Length CSV Rows 2 | 3 | `csv::CSVReader` generally 
assumes that most rows in a CSV are of the same length. 4 | If your CSV has important data stored in rows which may not be the same length 5 | as the others, then you may want to create your own subclass of CSVReader and 6 | override `bad_row_handler`. 7 | 8 | ## Examples 9 | * csv::CSVReader::bad_row_handler 10 | * csv::internals::CSVGuesser::Guesser::bad_row_handler() -------------------------------------------------------------------------------- /python/examples/PyDemo3.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csvpy 3 | 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv' 5 | reader = csvpy.Reader(str(path)) 6 | 7 | for row in reader: 8 | if row['Year'].is_int(): 9 | row['Year'].get_int() 10 | elif row['Year'].is_float(): 11 | row['Year'].get_float() 12 | elif row['Year'].is_str(): 13 | row['Year'].get_str() 14 | elif row['Year'].is_null(): 15 | pass 16 | -------------------------------------------------------------------------------- /python/examples/PyDemo4.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csvpy 3 | 4 | path = Path(__file__).parent.parent / 'data' / '2015_StateDepartment.csv' 5 | reader = csvpy.Reader(str(path)) 6 | 7 | # for row in reader: 8 | # print(row.to_json()) 9 | # print(row.to_json_array()) 10 | 11 | for row in reader: 12 | # You can pass in a list of column names to slice or rearrange the outputted JSON 13 | print(row.to_json(['Entity Type', 'Year'])) 14 | print(row.to_json_array(['Year', 'Entity Type'])) -------------------------------------------------------------------------------- /programs/csv_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | 4 | parser = argparse.ArgumentParser(description='Count the number of lines in a CSV') 5 | parser.add_argument('file', type=str, 
nargs=1, 6 | help='File to parse') 7 | parser.add_argument('encoding', nargs='?', type=str, default='utf-8', 8 | help='File encoding') 9 | 10 | args = parser.parse_args() 11 | file = args.file[0] 12 | enc = args.encoding 13 | 14 | j = 0 15 | with open(file, 'r', encoding=enc) as csv_file: 16 | reader = csv.reader(csv_file) 17 | for i in reader: 18 | j += 1 19 | 20 | print(j) -------------------------------------------------------------------------------- /programs/csv_info.cpp: -------------------------------------------------------------------------------- 1 | #include "csv.hpp" 2 | #include 3 | 4 | int main(int argc, char** argv) { 5 | using namespace csv; 6 | 7 | if (argc < 2) { 8 | std::cout << "Usage: " << argv[0] << " [file]" << std::endl; 9 | exit(1); 10 | } 11 | 12 | std::string file = argv[1]; 13 | auto info = get_file_info(file); 14 | 15 | std::cout << file << std::endl 16 | << "Columns: " << internals::format_row(info.col_names, ", ") 17 | << "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns" << std::endl 18 | << "Delimiter: " << info.delim << std::endl; 19 | 20 | return 0; 21 | } -------------------------------------------------------------------------------- /single_include_test/file1.cpp: -------------------------------------------------------------------------------- 1 | #include "my_header.hpp" 2 | #include 3 | 4 | int foobar(int argc, char** argv) { 5 | using namespace csv; 6 | 7 | if (argc < 2) { 8 | std::cout << "Usage: " << argv[0] << " [file]" << std::endl; 9 | exit(1); 10 | } 11 | 12 | std::string file = argv[1]; 13 | auto info = get_file_info(file); 14 | 15 | std::cout << file << std::endl 16 | << "Columns: " << internals::format_row(info.col_names, ", ") 17 | << "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns" << std::endl 18 | << "Delimiter: " << info.delim << std::endl; 19 | 20 | return 0; 21 | } -------------------------------------------------------------------------------- 
/include/internal/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(csv STATIC "") 2 | 3 | target_sources(csv 4 | PRIVATE 5 | basic_csv_parser.hpp 6 | basic_csv_parser.cpp 7 | col_names.cpp 8 | col_names.hpp 9 | common.hpp 10 | csv_format.hpp 11 | csv_format.cpp 12 | csv_reader.hpp 13 | csv_reader.cpp 14 | csv_reader_iterator.cpp 15 | csv_row.hpp 16 | csv_row.cpp 17 | csv_row_json.cpp 18 | csv_stat.cpp 19 | csv_stat.hpp 20 | csv_utility.cpp 21 | csv_utility.hpp 22 | csv_writer.hpp 23 | "data_type.hpp" 24 | ) 25 | 26 | set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX) 27 | target_link_libraries(csv PRIVATE Threads::Threads) 28 | target_include_directories(csv INTERFACE ../) 29 | -------------------------------------------------------------------------------- /programs/round_trip.cpp: -------------------------------------------------------------------------------- 1 | #include "csv.hpp" 2 | #include 3 | 4 | int main(int argc, char** argv) { 5 | using namespace csv; 6 | 7 | if (argc < 3) { 8 | std::cout << "Usage: " << argv[0] << " [file] [out]" << std::endl; 9 | exit(1); 10 | } 11 | 12 | std::string file = argv[1]; 13 | std::string out = argv[2]; 14 | 15 | std::ofstream outfile(out); 16 | auto writer = make_csv_writer(outfile); 17 | 18 | CSVFormat format; 19 | format.variable_columns(true); 20 | CSVReader reader(file, format); 21 | writer << reader.get_col_names(); 22 | 23 | for (auto& row: reader) { 24 | writer << std::vector(row); 25 | } 26 | 27 | return 0; 28 | } -------------------------------------------------------------------------------- /docs/source/scientific_notation.md: -------------------------------------------------------------------------------- 1 | # Scientific Notation Parsing 2 | 3 | This library has support for parsing scientific notation through `csv::internals::data_type()`, 4 | which is in turn called by `csv::CSVField::get()` when used with a floating point value type 5 | 
as the template parameter. Malformed scientific notation will be interpreted by this 6 | library as a regular string. 7 | 8 | ## Examples 9 | \snippet tests/test_data_type.cpp Parse Scientific Notation 10 | 11 | ## Supported Flavors 12 | 13 | Many different variations of E-notation are supported, as long as there isn't a whitespace 14 | between E and the successive exponent. As seen below, the `+` sign is optional, and any number of 15 | zeroes is accepted. 16 | 17 | \snippet tests/test_data_type.cpp Scientific Notation Flavors -------------------------------------------------------------------------------- /tests/test_csv_delimeter.cpp: -------------------------------------------------------------------------------- 1 | #include "csv.hpp" 2 | #include 3 | #include 4 | #include 5 | 6 | TEST_CASE("Test delim from file", "[test_csv_reader_get_format_get_delim_from_file]") { 7 | csv::CSVReader reader("./tests/data/fake_data/delimeter.csv"); 8 | char delim = reader.get_format().get_delim(); 9 | REQUIRE(delim == ';'); 10 | } 11 | 12 | TEST_CASE("Test delim from string", "[test_csv_reader_get_format_get_delim_from_string]") { 13 | std::ifstream file_stream("./tests/data/fake_data/delimeter.csv"); 14 | std::string csv_data((std::istreambuf_iterator(file_stream)), std::istreambuf_iterator()); 15 | std::stringstream ss(csv_data); 16 | 17 | csv::CSVReader reader(ss); 18 | char delim = reader.get_format().get_delim(); 19 | REQUIRE(delim == ';'); 20 | } 21 | -------------------------------------------------------------------------------- /programs/csv_stats.cpp: -------------------------------------------------------------------------------- 1 | #include "csv.hpp" 2 | 3 | int main(int argc, char** argv) { 4 | using namespace csv; 5 | 6 | if (argc < 2) { 7 | std::cout << "Usage: " << argv[0] << " [file]" << std::endl; 8 | exit(1); 9 | } 10 | 11 | std::string filename = argv[1]; 12 | CSVStat stats(filename); 13 | 14 | auto col_names = stats.get_col_names(); 15 | auto min = 
stats.get_mins(), max = stats.get_maxes(), 16 | means = stats.get_mean(), vars = stats.get_variance(); 17 | 18 | for (size_t i = 0; i < col_names.size(); i++) { 19 | std::cout << col_names[i] << std::endl 20 | << "Min: " << min[i] << std::endl 21 | << "Max: " << max[i] << std::endl 22 | << "Mean: " << means[i] << std::endl 23 | << "Var: " << vars[i] << std::endl; 24 | } 25 | 26 | return 0; 27 | } -------------------------------------------------------------------------------- /single_include_test/file2.cpp: -------------------------------------------------------------------------------- 1 | #include "my_header.hpp" 2 | 3 | int main(int argc, char** argv) { 4 | using namespace csv; 5 | 6 | if (argc < 2) { 7 | std::cout << "Usage: " << argv[0] << " [file]" << std::endl; 8 | exit(1); 9 | } 10 | 11 | std::string filename = argv[1]; 12 | CSVStat stats(filename); 13 | 14 | auto col_names = stats.get_col_names(); 15 | auto min = stats.get_mins(), max = stats.get_maxes(), 16 | means = stats.get_mean(), vars = stats.get_variance(); 17 | 18 | for (size_t i = 0; i < col_names.size(); i++) { 19 | std::cout << col_names[i] << std::endl 20 | << "Min: " << min[i] << std::endl 21 | << "Max: " << max[i] << std::endl 22 | << "Mean: " << means[i] << std::endl 23 | << "Var: " << vars[i] << std::endl; 24 | } 25 | 26 | return 0; 27 | } -------------------------------------------------------------------------------- /CMakeSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "x64-Release", 5 | "generator": "Ninja", 6 | "configurationType": "RelWithDebInfo", 7 | "inheritEnvironments": [ 8 | "msvc_x64_x64" 9 | ], 10 | "buildRoot": "${projectDir}\\build\\${name}", 11 | "installRoot": "${projectDir}\\install\\${name}", 12 | "cmakeCommandArgs": "", 13 | "buildCommandArgs": "-v", 14 | "ctestCommandArgs": "" 15 | }, 16 | { 17 | "name": "x64-Debug", 18 | "generator": "Ninja", 19 | "configurationType": 
"Debug", 20 | "inheritEnvironments": [ 21 | "msvc_x64_x64" 22 | ], 23 | "buildRoot": "${projectDir}\\build\\${name}", 24 | "installRoot": "${projectDir}\\install\\${name}", 25 | "cmakeCommandArgs": "", 26 | "buildCommandArgs": "-v", 27 | "ctestCommandArgs": "" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (EXISTS ${CMAKE_CURRENT_LIST_DIR}/pybind11) 2 | add_subdirectory(pybind11) 3 | 4 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") 5 | set(PYBIND11_CPP_STANDARD /std:c++17) 6 | else() 7 | set(PYBIND11_CPP_STANDARD -std=c++1z) 8 | endif() 9 | 10 | set(CSVPY_SOURCES 11 | ${CMAKE_CURRENT_LIST_DIR}/csvpy.cpp 12 | ) 13 | 14 | pybind11_add_module(csvpy ${CSVPY_SOURCES}) 15 | target_link_libraries(csvpy PUBLIC csv) 16 | 17 | get_property(csvpySuffix TARGET csvpy PROPERTY SUFFIX) 18 | add_custom_command(TARGET csvpy POST_BUILD 19 | COMMAND ${CMAKE_COMMAND} -E copy $ 20 | ${CMAKE_CURRENT_LIST_DIR}/csvpy/csvpy${csvpySuffix}) 21 | else() 22 | message("Cannot build Python bindings because pybind11 submodule was not found. 
Please run ""git submodule update --recursive"".") 23 | endif() -------------------------------------------------------------------------------- /programs/csv_generator.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "csv.hpp" 7 | 8 | int main(int argc, char** argv) { 9 | using namespace csv; 10 | std::uniform_real_distribution d(1, 1000000); 11 | std::mt19937 gen; 12 | gen.seed(time(0)); 13 | 14 | if (argc < 2) { 15 | std::cout << "Usage: " << argv[0] << " [file]" << std::endl; 16 | exit(1); 17 | } 18 | 19 | std::string file = argv[1]; 20 | std::ofstream outfile(file); 21 | 22 | CSVWriter writer(outfile); 23 | writer << std::vector({"A", "B", "C", "D", "E"}); 24 | 25 | for (size_t i = 0; i < 1000000; i++) { 26 | writer << std::array({ 27 | d(gen), 28 | d(gen), 29 | d(gen), 30 | d(gen), 31 | d(gen) 32 | }); 33 | } 34 | 35 | return 0; 36 | } -------------------------------------------------------------------------------- /python/csvpy/DictReader.py: -------------------------------------------------------------------------------- 1 | from .csvpy import Reader, DataType 2 | 3 | class DictReader: 4 | def __init__(self, filename): 5 | self._reader = Reader(filename) 6 | self._csvIterator = self._reader.__iter__() 7 | 8 | def __iter__(self): 9 | return self 10 | 11 | def __next__(self): 12 | ret = dict() 13 | next_row = self._csvIterator.__next__() 14 | 15 | for col_name in next_row.get_col_names(): 16 | field = next_row[col_name] 17 | field_type = field.type() 18 | value = None 19 | 20 | if field_type == DataType.CSV_STRING: 21 | value = field.get_str() 22 | elif field_type >= DataType.CSV_INT8 and field_type <= DataType.CSV_INT64: 23 | value = field.get_int() 24 | elif field_type == DataType.CSV_DOUBLE: 25 | value = field.get_double() 26 | 27 | ret[col_name] = value 28 | 29 | return ret 
-------------------------------------------------------------------------------- /include/internal/col_names.cpp: -------------------------------------------------------------------------------- 1 | #include "col_names.hpp" 2 | 3 | namespace csv { 4 | namespace internals { 5 | CSV_INLINE std::vector ColNames::get_col_names() const { 6 | return this->col_names; 7 | } 8 | 9 | CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { 10 | this->col_names = cnames; 11 | 12 | for (size_t i = 0; i < cnames.size(); i++) { 13 | this->col_pos[cnames[i]] = i; 14 | } 15 | } 16 | 17 | CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { 18 | auto pos = this->col_pos.find(col_name.data()); 19 | if (pos != this->col_pos.end()) 20 | return (int)pos->second; 21 | 22 | return CSV_NOT_FOUND; 23 | } 24 | 25 | CSV_INLINE size_t ColNames::size() const noexcept { 26 | return this->col_names.size(); 27 | } 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /tests/test_guess_csv.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Tests for CSV parsing 3 | */ 4 | 5 | #include // remove() 6 | #include 7 | #include 8 | #include "csv.hpp" 9 | 10 | using namespace csv; 11 | using std::vector; 12 | using std::string; 13 | 14 | // 15 | // guess_delim() 16 | // 17 | TEST_CASE("guess_delim() Test - Pipe", "[test_guess_pipe]") { 18 | CSVGuessResult format = guess_format( 19 | "./tests/data/real_data/2009PowerStatus.txt"); 20 | REQUIRE(format.delim == '|'); 21 | REQUIRE(format.header_row == 0); 22 | } 23 | 24 | TEST_CASE("guess_delim() Test - Semi-Colon", "[test_guess_scolon]") { 25 | CSVGuessResult format = guess_format( 26 | "./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); 27 | REQUIRE(format.delim == ';'); 28 | REQUIRE(format.header_row == 0); 29 | } 30 | 31 | TEST_CASE("guess_delim() Test - CSV with Comments", "[test_guess_comment]") { 32 | CSVGuessResult format = 
guess_format( 33 | "./tests/data/fake_data/ints_comments.csv"); 34 | REQUIRE(format.delim == ','); 35 | REQUIRE(format.header_row == 5); 36 | } -------------------------------------------------------------------------------- /programs/csv_guess_bench.cpp: -------------------------------------------------------------------------------- 1 | // Calculate benchmarks for CSV guessing 2 | 3 | #include "csv.hpp" 4 | #include 5 | #include 6 | #include 7 | 8 | int main(int argc, char** argv) { 9 | using namespace csv; 10 | 11 | if (argc < 2) { 12 | std::cout << "Usage: " << argv[0] << " [file]" << std::endl; 13 | exit(1); 14 | } 15 | 16 | std::string filename = argv[1]; 17 | std::vector times = {}; 18 | int trials = 5; 19 | 20 | for (int i = 0; i < trials; i++) { 21 | auto start = std::chrono::system_clock::now(); 22 | 23 | // This reads just the first 500 kb of a file 24 | CSVReader reader(filename, CSVFormat::guess_csv()); 25 | 26 | auto end = std::chrono::system_clock::now(); 27 | std::chrono::duration diff = end - start; 28 | times.push_back(diff.count()); 29 | } 30 | 31 | double avg = 0; 32 | for (double time: times) { 33 | avg += time * 1/trials; 34 | } 35 | std::cout << "Guessing took: " << avg << " seconds (averaged over " << trials << " trials)" << std::endl; 36 | 37 | return 0; 38 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017-2019 Vincent La 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 
11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /include/internal/col_names.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "common.hpp" 8 | 9 | namespace csv { 10 | namespace internals { 11 | struct ColNames; 12 | using ColNamesPtr = std::shared_ptr; 13 | 14 | /** @struct ColNames 15 | * A data structure for handling column name information. 16 | * 17 | * These are created by CSVReader and passed (via smart pointer) 18 | * to CSVRow objects it creates, thus 19 | * allowing for indexing by column name. 
20 | */ 21 | struct ColNames { 22 | public: 23 | ColNames() = default; 24 | ColNames(const std::vector& names) { 25 | set_col_names(names); 26 | } 27 | 28 | std::vector get_col_names() const; 29 | void set_col_names(const std::vector&); 30 | int index_of(csv::string_view) const; 31 | 32 | bool empty() const noexcept { return this->col_names.empty(); } 33 | size_t size() const noexcept; 34 | 35 | private: 36 | std::vector col_names; 37 | std::unordered_map col_pos; 38 | }; 39 | } 40 | } -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | 3 | FetchContent_Declare( 4 | Catch2 5 | GIT_REPOSITORY https://github.com/catchorg/Catch2.git 6 | GIT_TAG v3.6.0 7 | ) 8 | 9 | FetchContent_MakeAvailable(Catch2) 10 | 11 | add_executable(csv_test "") 12 | target_sources(csv_test 13 | PRIVATE 14 | ${CSV_INCLUDE_DIR}/csv.hpp 15 | main.cpp 16 | test_csv_field.cpp 17 | test_csv_field_array.cpp 18 | test_csv_format.cpp 19 | test_csv_iterator.cpp 20 | test_csv_row.cpp 21 | test_csv_row_json.cpp 22 | test_csv_stat.cpp 23 | test_guess_csv.cpp 24 | test_read_csv.cpp 25 | test_read_csv_file.cpp 26 | test_write_csv.cpp 27 | test_data_type.cpp 28 | test_raw_csv_data.cpp 29 | test_round_trip.cpp 30 | test_csv_delimeter.cpp 31 | test_csv_ranges.cpp 32 | ) 33 | target_link_libraries(csv_test csv) 34 | target_link_libraries(csv_test Catch2::Catch2WithMain) 35 | 36 | if(MSVC) 37 | # Workaround to enable debugging unit tests in Visual Studio 38 | add_custom_command( 39 | TARGET csv_test POST_BUILD 40 | COMMAND ${CMAKE_COMMAND} -E copy_directory 41 | ${CSV_TEST_DIR}/data $/tests/data 42 | ) 43 | endif() 44 | 45 | add_test( 46 | NAME test 47 | COMMAND csv_test 48 | WORKING_DIRECTORY ${CSV_ROOT_DIR} 49 | ) -------------------------------------------------------------------------------- /tests/shared/float_test_cases.hpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using std::make_tuple; 5 | 6 | namespace csv_test { 7 | static const std::initializer_list> FLOAT_TEST_CASES = { 8 | make_tuple("3.14", 3.14L), 9 | make_tuple("+3.14", 3.14L), 10 | make_tuple(" -3.14 ", -3.14L), 11 | make_tuple("2.71828", 2.71828L), 12 | 13 | // Test uniform distribution values 14 | make_tuple("0.12", 0.12L), 15 | make_tuple("0.334", 0.334L), 16 | make_tuple("0.625", 0.625L), 17 | make_tuple("0.666666", 0.666666L), 18 | make_tuple("0.69", 0.69L), 19 | 20 | // Test negative values between 0 and 1 21 | make_tuple("-0.12", -0.12L), 22 | make_tuple("-0.334", -0.334L), 23 | make_tuple("-0.625", -0.625L), 24 | make_tuple("-0.666666", -0.666666L), 25 | make_tuple("-0.69", -0.69L), 26 | 27 | // Larger numbers 28 | make_tuple("1000.00", 1000L), 29 | make_tuple("1000000.00", 1000000L), 30 | make_tuple("9999999.99", 9999999.99L), 31 | make_tuple("99999999.999", 99999999.999L), 32 | 33 | make_tuple("-1000.00", -1000L), 34 | make_tuple("-1000000.00", -1000000L), 35 | make_tuple("-9999999.99", -9999999.99L), 36 | make_tuple("-99999999.999", -99999999.999L), 37 | }; 38 | } -------------------------------------------------------------------------------- /programs/csv_bench.cpp: -------------------------------------------------------------------------------- 1 | // Calculate benchmarks for CSV parser 2 | 3 | #include "csv.hpp" 4 | #include 5 | #include 6 | #include 7 | 8 | int main(int argc, char** argv) { 9 | using namespace csv; 10 | 11 | if (argc < 2) { 12 | std::cout << "Usage: " << argv[0] << " [file]" << std::endl; 13 | exit(1); 14 | } 15 | 16 | // Benchmark 1: File IO + Parsing 17 | std::string filename = argv[1]; 18 | auto start = std::chrono::system_clock::now(); 19 | auto info = get_file_info(filename); 20 | auto end = std::chrono::system_clock::now(); 21 | std::chrono::duration diff = end - start; 22 | 23 | std::cout << "Parsing took 
(including disk IO): " << diff.count() << std::endl; 24 | std::cout << "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns " << std::endl; 25 | std::cout << "Columns: "; 26 | for (auto& col : info.col_names) { 27 | std::cout << " " << col; 28 | } 29 | std::cout << std::endl; 30 | 31 | // Benchmark 2: Parsing Only 32 | /* 33 | std::ifstream csv(filename); 34 | std::stringstream buffer; 35 | buffer << csv.rdbuf(); 36 | 37 | auto csv_str = buffer.str(); 38 | 39 | start = std::chrono::system_clock::now(); 40 | parse(csv_str); 41 | end = std::chrono::system_clock::now(); 42 | diff = end - start; 43 | 44 | std::cout << "Parsing took: " << diff.count() << std::endl; 45 | */ 46 | 47 | return 0; 48 | } -------------------------------------------------------------------------------- /include/internal/csv_utility.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "common.hpp" 3 | #include "csv_format.hpp" 4 | #include "csv_reader.hpp" 5 | #include "data_type.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace csv { 12 | /** Returned by get_file_info() */ 13 | struct CSVFileInfo { 14 | std::string filename; /**< Filename */ 15 | std::vector col_names; /**< CSV column names */ 16 | char delim; /**< Delimiting character */ 17 | size_t n_rows; /**< Number of rows in a file */ 18 | size_t n_cols; /**< Number of columns in a CSV */ 19 | }; 20 | 21 | /** @name Shorthand Parsing Functions 22 | * @brief Convenience functions for parsing small strings 23 | */ 24 | ///@{ 25 | CSVReader operator ""_csv(const char*, size_t); 26 | CSVReader operator ""_csv_no_header(const char*, size_t); 27 | CSVReader parse(csv::string_view in, CSVFormat format = CSVFormat()); 28 | CSVReader parse_no_header(csv::string_view in); 29 | ///@} 30 | 31 | /** @name Utility Functions */ 32 | ///@{ 33 | std::unordered_map csv_data_types(const std::string&); 34 | CSVFileInfo get_file_info(const std::string& 
filename); 35 | int get_col_pos(csv::string_view filename, csv::string_view col_name, 36 | const CSVFormat& format = CSVFormat::guess_csv()); 37 | ///@} 38 | } -------------------------------------------------------------------------------- /include/csv.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | CSV for C++, version 2.3.0 3 | https://github.com/vincentlaucsb/csv-parser 4 | 5 | MIT License 6 | 7 | Copyright (c) 2017-2024 Vincent La 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 
26 | */ 27 | 28 | #pragma once 29 | #ifndef CSV_HPP 30 | #define CSV_HPP 31 | 32 | #include "internal/csv_reader.hpp" 33 | #include "internal/csv_stat.hpp" 34 | #include "internal/csv_utility.hpp" 35 | #include "internal/csv_writer.hpp" 36 | 37 | /** INSERT_CSV_SOURCES **/ 38 | 39 | #endif -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom Settings 2 | CMakeLists2.txt 3 | 4 | # Build 5 | bin/ 6 | build/ 7 | 8 | # Build: Python 9 | *.pyc 10 | *.pyd 11 | 12 | # Doxygen 13 | docs/html 14 | *.tmp 15 | 16 | # Visual Studio 17 | .vs/ 18 | *.pdb 19 | *.i* 20 | *.*log 21 | 22 | # Compiled Object files 23 | *.slo 24 | *.lo 25 | *.o 26 | *.obj 27 | 28 | # Precompiled Headers 29 | *.gch 30 | *.pch 31 | 32 | # Compiled Dynamic libraries 33 | *.so 34 | *.dylib 35 | *.dll 36 | 37 | # Fortran module files 38 | *.mod 39 | 40 | # Compiled Static libraries 41 | *.lai 42 | *.la 43 | *.a 44 | *.lib 45 | 46 | # Executables 47 | *.exe 48 | *.out 49 | *.app 50 | 51 | # Test outputs 52 | *.gcda 53 | *.gcno 54 | *.gcov 55 | 56 | # ========================= 57 | # Operating System Files 58 | # ========================= 59 | 60 | # OSX 61 | # ========================= 62 | 63 | .DS_Store 64 | .AppleDouble 65 | .LSOverride 66 | 67 | # Thumbnails 68 | ._* 69 | 70 | # Files that might appear in the root of a volume 71 | .DocumentRevisions-V100 72 | .fseventsd 73 | .Spotlight-V100 74 | .TemporaryItems 75 | .Trashes 76 | .VolumeIcon.icns 77 | 78 | # Directories potentially created on remote AFP share 79 | .AppleDB 80 | .AppleDesktop 81 | Network Trash Folder 82 | Temporary Items 83 | .apdisk 84 | 85 | # Windows 86 | # ========================= 87 | 88 | # Windows image file caches 89 | Thumbs.db 90 | ehthumbs.db 91 | 92 | # Folder config file 93 | Desktop.ini 94 | 95 | # Recycle Bin used on file shares 96 | $RECYCLE.BIN/ 97 | 98 | # Windows Installer files 99 | 
*.cab 100 | *.msi 101 | *.msm 102 | *.msp 103 | 104 | # Windows shortcuts 105 | *.lnk 106 | -------------------------------------------------------------------------------- /programs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/csv_info.cpp) 2 | target_link_libraries(csv_info csv) 3 | 4 | add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/csv_stats.cpp) 5 | target_link_libraries(csv_stats csv) 6 | 7 | # Provide rudimentary benchmarks 8 | if(CSV_DEVELOPER) 9 | add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/csv_guess_bench.cpp) 10 | target_link_libraries(csv_guess_bench csv) 11 | 12 | # Benchmarks for parsing speed 13 | add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/csv_bench.cpp) 14 | target_link_libraries(csv_bench csv) 15 | 16 | add_custom_target(generate_csv_bench 17 | COMMAND csv_bench 2015_StateDepartment.csv 18 | WORKING_DIRECTORY ${CSV_TEST_DIR}/data/real_data 19 | ) 20 | 21 | # Don't compile programs if required headers are not found 22 | include(CheckCXXSourceCompiles) 23 | check_cxx_source_compiles(" 24 | #include 25 | 26 | int main(int argc, char** argv) { 27 | return 0; 28 | } 29 | " haveCharconv) 30 | 31 | check_cxx_source_compiles(" 32 | #include 33 | 34 | int main(int argc, char** argv) { 35 | const char* str = \"123.456\"; 36 | long double d; 37 | std::from_chars(str, str + 7, d); 38 | return 0; 39 | } 40 | " FROM_CHARS_SUPPORT_DOUBLE) 41 | 42 | if(haveCharconv) 43 | add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/csv_generator.cpp) 44 | target_link_libraries(csv_generator csv) 45 | 46 | # Benchmarks for data_type() function 47 | if(FROM_CHARS_SUPPORT_DOUBLE) 48 | add_definitions(-DFROM_CHARS_SUPPORT_DOUBLE) 49 | endif() 50 | add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/data_type_bench.cpp) 51 | target_link_libraries(data_type_bench csv) 52 | 53 | add_custom_target(generate_dtype_bench 54 | COMMAND data_type_bench 
2015_StateDepartment.csv "Regular Pay" 55 | WORKING_DIRECTORY ${CSV_TEST_DIR}/data/real_data) 56 | endif() 57 | endif() -------------------------------------------------------------------------------- /tests/test_csv_ranges.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #ifdef CSV_HAS_CXX20 5 | #include 6 | 7 | TEST_CASE("CSVReader C++20 Ranges Compatibility", "[ranges][cxx20]") { 8 | SECTION("CSVReader works with std::ranges::distance") { 9 | std::stringstream ss("A,B,C\n1,2,3\n4,5,6\n7,8,9"); 10 | csv::CSVReader reader(ss); 11 | 12 | auto count = std::ranges::distance(reader); 13 | REQUIRE(count == 3); 14 | } 15 | 16 | SECTION("CSVReader works with std::views") { 17 | std::stringstream ss("A,B,C\n1,2,3\n4,5,6\n7,8,9\n10,11,12"); 18 | csv::CSVReader reader(ss); 19 | 20 | auto filtered = reader | 21 | std::views::filter([](const csv::CSVRow &row) { 22 | return !row.empty() && row[0].get() > 5; 23 | }); 24 | 25 | int filtered_count = 0; 26 | for (const auto &row : filtered) { 27 | filtered_count++; 28 | int val = row[0].get(); 29 | REQUIRE(val > 5); 30 | } 31 | REQUIRE(filtered_count == 2); // rows with 7 and 10 32 | } 33 | 34 | SECTION("CSVReader iterator satisfies input_range requirements") { 35 | std::stringstream ss("A,B\n1,2\n3,4"); 36 | csv::CSVReader reader(ss); 37 | 38 | auto it = reader.begin(); 39 | auto end = reader.end(); 40 | 41 | static_assert(std::input_iterator); 42 | static_assert(std::ranges::range); 43 | static_assert(std::ranges::input_range); 44 | static_assert(std::sentinel_for); 45 | 46 | REQUIRE(it != end); 47 | auto row = *it; 48 | REQUIRE(row.size() == 2); 49 | 50 | ++it; 51 | REQUIRE(it != end); 52 | 53 | ++it; 54 | REQUIRE(it == end); 55 | } 56 | } 57 | #endif 58 | -------------------------------------------------------------------------------- /tests/test_round_trip.cpp: -------------------------------------------------------------------------------- 1 | /** Tests 
of both reading and writing */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "csv.hpp" 9 | 10 | using namespace csv; 11 | 12 | TEST_CASE("Simple Buffered Integer Round Trip Test", "[test_roundtrip_int]") { 13 | auto filename = "round_trip.csv"; 14 | std::ofstream outfile(filename, std::ios::binary); 15 | auto writer = make_csv_writer_buffered(outfile); 16 | 17 | writer << std::vector({"A", "B", "C", "D", "E"}); 18 | 19 | const size_t n_rows = 1000000; 20 | 21 | for (size_t i = 0; i < n_rows; i++) { 22 | auto str = internals::to_string(i); 23 | writer << std::array({str, str, str, str, str}); 24 | } 25 | writer.flush(); 26 | 27 | CSVReader reader(filename); 28 | 29 | size_t i = 0; 30 | for (auto &row : reader) { 31 | for (auto &col : row) { 32 | REQUIRE(col == i); 33 | } 34 | 35 | i++; 36 | } 37 | 38 | REQUIRE(reader.n_rows() == n_rows); 39 | 40 | remove(filename); 41 | } 42 | 43 | TEST_CASE("Simple Integer Round Trip Test", "[test_roundtrip_int]") { 44 | auto filename = "round_trip.csv"; 45 | std::ofstream outfile(filename, std::ios::binary); 46 | auto writer = make_csv_writer(outfile); 47 | 48 | writer << std::vector({ "A", "B", "C", "D", "E" }); 49 | 50 | const size_t n_rows = 1000000; 51 | 52 | for (size_t i = 0; i < n_rows; i++) { 53 | auto str = internals::to_string(i); 54 | writer << std::array({ str, str, str, str, str }); 55 | } 56 | 57 | CSVReader reader(filename); 58 | 59 | size_t i = 0; 60 | for (auto& row : reader) { 61 | for (auto& col : row) { 62 | REQUIRE(col == i); 63 | } 64 | 65 | i++; 66 | } 67 | 68 | REQUIRE(reader.n_rows() == n_rows); 69 | 70 | remove(filename); 71 | } -------------------------------------------------------------------------------- /tests/test_csv_format.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "csv.hpp" 3 | using namespace csv; 4 | 5 | static std::string err_preamble = "There should be no overlap between " 6 | "the quote character, 
the set of possible " 7 | "delimiters and the set of whitespace characters."; 8 | 9 | // Assert that an error is thrown if the whitespace, delimiter, and quote characters overlap 10 | TEST_CASE("CSVFormat - Overlapping Characters", "[csv_format_overlap]") { 11 | CSVFormat format; 12 | bool err_caught = false; 13 | 14 | SECTION("Tab") { 15 | try { 16 | format.delimiter('\t').quote('"').trim({ '\t' }); 17 | } 18 | catch (std::runtime_error& err) { 19 | err_caught = true; 20 | REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\t'.")); 21 | } 22 | 23 | REQUIRE(err_caught); 24 | } 25 | 26 | SECTION("Tab with multiple other characters") { 27 | try { 28 | format.delimiter({ ',', '\t' }).quote('"').trim({ ' ', '\t' }); 29 | } 30 | catch (std::runtime_error& err) { 31 | err_caught = true; 32 | REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\t'.")); 33 | } 34 | 35 | REQUIRE(err_caught); 36 | } 37 | 38 | SECTION("Repeated quote") { 39 | try { 40 | format.delimiter({ ',', '"' }).quote('"').trim({ ' ', '\t' }); 41 | } 42 | catch (std::runtime_error& err) { 43 | err_caught = true; 44 | REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\"'.")); 45 | } 46 | 47 | REQUIRE(err_caught); 48 | } 49 | 50 | SECTION("Multiple offenders") { 51 | try { 52 | format.delimiter({ ',', '\t', ' ' }).quote('"').trim({ ' ', '\t' }); 53 | } 54 | catch (std::runtime_error& err) { 55 | err_caught = true; 56 | REQUIRE(err.what() == std::string(err_preamble + " Offending characters: '\t', ' '.")); 57 | } 58 | 59 | REQUIRE(err_caught); 60 | } 61 | } -------------------------------------------------------------------------------- /include/internal/csv_stat.hpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Calculates statistics from CSV files 3 | */ 4 | 5 | #pragma once 6 | #include 7 | #include 8 | #include 9 | #include "csv_reader.hpp" 10 | 11 | namespace csv { 12 | /** Class for calculating 
statistics from CSV files and in-memory sources 13 | * 14 | * **Example** 15 | * \include programs/csv_stats.cpp 16 | * 17 | */ 18 | class CSVStat { 19 | public: 20 | using FreqCount = std::unordered_map; 21 | using TypeCount = std::unordered_map; 22 | 23 | std::vector get_mean() const; 24 | std::vector get_variance() const; 25 | std::vector get_mins() const; 26 | std::vector get_maxes() const; 27 | std::vector get_counts() const; 28 | std::vector get_dtypes() const; 29 | 30 | std::vector get_col_names() const { 31 | return this->reader.get_col_names(); 32 | } 33 | 34 | CSVStat(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv()); 35 | CSVStat(std::stringstream& source, CSVFormat format = CSVFormat()); 36 | private: 37 | // An array of rolling averages 38 | // Each index corresponds to the rolling mean for the column at said index 39 | std::vector rolling_means; 40 | std::vector rolling_vars; 41 | std::vector mins; 42 | std::vector maxes; 43 | std::vector counts; 44 | std::vector dtypes; 45 | std::vector n; 46 | 47 | // Statistic calculators 48 | void variance(const long double&, const size_t&); 49 | void count(CSVField&, const size_t&); 50 | void min_max(const long double&, const size_t&); 51 | void dtype(CSVField&, const size_t&); 52 | 53 | void calc(); 54 | void calc_chunk(); 55 | void calc_worker(const size_t&); 56 | 57 | CSVReader reader; 58 | std::deque records = {}; 59 | }; 60 | } -------------------------------------------------------------------------------- /include/internal/csv_reader_iterator.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Defines an input iterator for csv::CSVReader 3 | */ 4 | 5 | #include "csv_reader.hpp" 6 | 7 | namespace csv { 8 | /** Return an iterator to the first row in the reader */ 9 | CSV_INLINE CSVReader::iterator CSVReader::begin() { 10 | if (this->records->empty()) { 11 | this->read_csv_worker = std::thread(&CSVReader::read_csv, this, 
internals::ITERATION_CHUNK_SIZE); 12 | this->read_csv_worker.join(); 13 | 14 | // Still empty => return end iterator 15 | if (this->records->empty()) return this->end(); 16 | } 17 | 18 | this->_n_rows++; 19 | CSVReader::iterator ret(this, this->records->pop_front()); 20 | return ret; 21 | } 22 | 23 | /** A placeholder for the imaginary past the end row in a CSV. 24 | * Attempting to dereference this will lead to bad things. 25 | */ 26 | CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { 27 | return CSVReader::iterator(); 28 | } 29 | 30 | ///////////////////////// 31 | // CSVReader::iterator // 32 | ///////////////////////// 33 | 34 | CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : 35 | daddy(_daddy) { 36 | row = std::move(_row); 37 | } 38 | 39 | /** Advance the iterator by one row. If this CSVReader has an 40 | * associated file, then the iterator will lazily pull more data from 41 | * that file until the end of file is reached. 42 | * 43 | * @note This iterator does **not** block the thread responsible for parsing CSV. 
44 | * 45 | */ 46 | CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { 47 | if (!daddy->read_row(this->row)) { 48 | this->daddy = nullptr; // this == end() 49 | } 50 | 51 | return *this; 52 | } 53 | 54 | /** Post-increment iterator */ 55 | CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { 56 | auto temp = *this; 57 | if (!daddy->read_row(this->row)) { 58 | this->daddy = nullptr; // this == end() 59 | } 60 | 61 | return temp; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /programs/data_type_bench.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "csv.hpp" 5 | #ifndef NDEBUG 6 | #define NDEBUG 7 | #endif 8 | 9 | long double get_max(std::string file, std::string column, bool use_std = false); 10 | 11 | long double get_max(std::string file, std::string column, bool use_std) { 12 | using namespace csv; 13 | long double max = -std::numeric_limits::infinity(); 14 | CSVReader reader(file); 15 | 16 | for (auto& row : reader) { 17 | auto field = row[column]; 18 | long double out = 0; 19 | 20 | if (use_std) { 21 | auto _field = field.get(); 22 | #ifdef FROM_CHARS_SUPPORT_DOUBLE 23 | auto data = _field.data(); 24 | std::from_chars( 25 | data, data + _field.size(), 26 | out 27 | ); 28 | #else 29 | std::string str(_field); 30 | std::stringstream ss(str); 31 | ss >> out; 32 | #endif 33 | } 34 | else { 35 | out = field.get(); 36 | } 37 | 38 | if (out > max) { 39 | max = out; 40 | } 41 | } 42 | 43 | return max; 44 | } 45 | 46 | int main(int argc, char** argv) { 47 | using namespace csv; 48 | 49 | if (argc < 3) { 50 | std::cout << "Usage: " << argv[0] << " [file] [column]" << std::endl; 51 | exit(1); 52 | } 53 | 54 | std::string file = argv[1], 55 | column = argv[2]; 56 | 57 | long double max = 0, std_avg = 0, csv_avg = 0; 58 | const long double trials = 5; 59 | 60 | 61 | for (size_t i = 0; i < trials; i++) 
{ 62 | auto start = std::chrono::system_clock::now(); 63 | max = get_max(file, column, true); 64 | auto end = std::chrono::system_clock::now(); 65 | std::chrono::duration diff = end - start; 66 | std_avg += diff.count() / trials; 67 | 68 | start = std::chrono::system_clock::now(); 69 | max = get_max(file, column, false); 70 | end = std::chrono::system_clock::now(); 71 | diff = end - start; 72 | csv_avg += diff.count() / trials; 73 | } 74 | 75 | std::cout << "std::from_chars: " << std_avg << std::endl; 76 | std::cout << "csv::data_type: " << csv_avg << std::endl; 77 | std::cout << "Maximum value: " << max << std::endl; 78 | 79 | return 0; 80 | } -------------------------------------------------------------------------------- /tests/test_csv_row.cpp: -------------------------------------------------------------------------------- 1 | // Tests for the CSVRow and CSVField Data Structures 2 | 3 | #include 4 | #include "csv.hpp" 5 | using namespace csv; 6 | 7 | // Construct a CSVRow and assert that its interface works as expected 8 | TEST_CASE("CSVRow Test", "[test_csv_row]") { 9 | auto reader = "A,B,C,D\r\n" 10 | "Col1,Col2,Col3,Col4"_csv; 11 | 12 | CSVRow row; 13 | reader.read_row(row); 14 | 15 | bool error_caught = false; 16 | 17 | SECTION("size() Check") { 18 | REQUIRE(row.size() == 4); 19 | } 20 | 21 | SECTION("operator[]") { 22 | REQUIRE(row[1] == "Col2"); 23 | REQUIRE(row["B"] == "Col2"); 24 | 25 | REQUIRE(row[2] == "Col3"); 26 | REQUIRE(row["C"] == "Col3"); 27 | } 28 | 29 | SECTION("operator[] Out of Bounds") { 30 | try { 31 | row[4].get<>(); 32 | } 33 | catch (std::runtime_error&) { 34 | error_caught = true; 35 | } 36 | 37 | REQUIRE(error_caught); 38 | } 39 | 40 | SECTION("operator[] Access Non-Existent Column") { 41 | try { 42 | row["Col5"].get<>(); 43 | } 44 | catch (std::runtime_error&) { 45 | error_caught = true; 46 | } 47 | 48 | REQUIRE(error_caught); 49 | } 50 | 51 | SECTION("Content Check") { 52 | REQUIRE(std::vector(row) == 53 | std::vector({ "Col1", 
"Col2", "Col3", "Col4" })); 54 | } 55 | 56 | /** Allow get_sv() to be used with a const CSVField 57 | * and check the text collected through get_sv() against the expected fields 58 | * See: https://github.com/vincentlaucsb/csv-parser/issues/86 59 | * 60 | */ 61 | SECTION("get_sv() Check") { 62 | std::vector content; 63 | 64 | for (const auto& field : row) { 65 | content.push_back(std::string(field.get_sv())); 66 | } 67 | 68 | REQUIRE(content == 69 | std::vector({ "Col1", "Col2", "Col3", "Col4" })); 70 | } 71 | } 72 | 73 | // Integration test for CSVRow/CSVField 74 | TEST_CASE("CSVField operator==", "[test_csv_field_equal]") { 75 | auto reader = "A,B,C,D\r\n" 76 | "1,2,3,3.14"_csv; 77 | 78 | CSVRow row; 79 | reader.read_row(row); 80 | 81 | REQUIRE(row["A"] == 1); 82 | REQUIRE(row["B"] == 2); 83 | REQUIRE(row["C"] == 3); 84 | REQUIRE(internals::is_equal(row["D"].get(), 3.14L)); 85 | } -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile used for building/testing on Travis CI 2 | 3 | # Force Travis to use updated compilers 4 | ifeq ($(TRAVIS_COMPILER), gcc) 5 | CXX = g++-8 6 | else ifeq ($(TRAVIS_COMPILER), clang) 7 | CXX = clang++ 8 | endif 9 | 10 | ifeq ($(STD), ) 11 | STD = c++11 12 | endif 13 | 14 | BUILD_DIR = build 15 | SOURCE_DIR = include 16 | SINGLE_INCLUDE_DIR = single_include 17 | TEST_DIR = tests 18 | CFLAGS = -pthread -std=$(STD) 19 | 20 | TEST_OFLAGS = 21 | ifeq ($(CXX), g++-8) 22 | TEST_OFLAGS = -Og 23 | endif 24 | 25 | TEST_FLAGS = -Itests/ $(CFLAGS) $(TEST_OFLAGS) -g --coverage -Wno-unknown-pragmas -Wall 26 | 27 | # Main Library 28 | SOURCES = $(wildcard include/internal/*.cpp) 29 | OBJECTS = $(subst .cpp,.o,$(subst src/,$(BUILD_DIR)/,$(SOURCES))) 30 | 31 | TEST_SOURCES = $(wildcard tests/*.cpp) 32 | TEST_SOURCES_NO_EXT = $(subst tests/,,$(subst .cpp,,$(TEST_SOURCES))) 33 | 34 | all: csv_parser test_all clean distclean 35 | 36 | ################ 37 | # Main Library # 38 | 
################ 39 | csv: 40 | $(CXX) -c -O3 $(CFLAGS) $(SOURCES) 41 | mkdir -p $(BUILD_DIR) 42 | mv *.o $(BUILD_DIR) 43 | 44 | libcsv.a: 45 | make csv 46 | ar rvs libcsv.a $(wildcard build/*.o) 47 | 48 | docs: 49 | doxygen Doxyfile 50 | 51 | ############ 52 | # Programs # 53 | ############ 54 | csv_stats: 55 | $(CXX) -o csv_stats -O3 $(CFLAGS) programs/csv_stats.cpp -I$(SINGLE_INCLUDE_DIR) 56 | 57 | ######### 58 | # Tests # 59 | ######### 60 | csv_test: 61 | $(CXX) -o csv_test $(SOURCES) $(TEST_SOURCES) -I${SOURCE_DIR} $(TEST_FLAGS) 62 | 63 | run_csv_test: csv_test 64 | mkdir -p tests/temp 65 | ./csv_test 66 | 67 | # Test Clean-Up 68 | rm -rf $(TEST_DIR)/temp 69 | 70 | # Run code coverage analysis 71 | code_cov: csv_test 72 | mkdir -p test_results 73 | mv *.gcno *.gcda $(PWD)/test_results 74 | gcov-8 $(SOURCES) -o test_results --relative-only 75 | mv *.gcov test_results 76 | 77 | # Generate report 78 | code_cov_report: 79 | cd test_results 80 | lcov --capture --directory test_results --output-file coverage.info 81 | genhtml coverage.info --output-directory out 82 | 83 | valgrind: csv_stats 84 | # Can't run valgrind against csv_test because it mangles the working directory 85 | # which causes csv_test to not be able to find test files 86 | valgrind --leak-check=full ./csv_stats $(TEST_DIR)/data/real_data/2016_Gaz_place_national.txt 87 | 88 | .PHONY: all clean distclean 89 | 90 | clean: 91 | rm -f build/* 92 | rm -f *.gc* 93 | rm -f libcsv.a 94 | rm -f csv_* 95 | 96 | distclean: clean -------------------------------------------------------------------------------- /include/internal/csv_utility.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "csv_utility.hpp" 5 | 6 | namespace csv { 7 | /** Shorthand function for parsing an in-memory CSV string 8 | * 9 | * @return A collection of CSVRow objects 10 | * 11 | * @par Example 12 | * @snippet tests/test_read_csv.cpp Parse Example 13 | */ 14 | 
CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { 15 | std::stringstream stream(std::string(in.data(), in.length())); 16 | return CSVReader(stream, format); 17 | } 18 | 19 | /** Parses a CSV string with no headers 20 | * 21 | * @return A collection of CSVRow objects 22 | */ 23 | CSV_INLINE CSVReader parse_no_header(csv::string_view in) { 24 | CSVFormat format; 25 | format.header_row(-1); 26 | 27 | return parse(in, format); 28 | } 29 | 30 | /** Parse a RFC 4180 CSV string, returning a collection 31 | * of CSVRow objects 32 | * 33 | * @par Example 34 | * @snippet tests/test_read_csv.cpp Escaped Comma 35 | * 36 | */ 37 | CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { 38 | return parse(csv::string_view(in, n)); 39 | } 40 | 41 | /** A shorthand for csv::parse_no_header() */ 42 | CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { 43 | return parse_no_header(csv::string_view(in, n)); 44 | } 45 | 46 | /** 47 | * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise 48 | * 49 | * @param[in] filename Path to CSV file 50 | * @param[in] col_name Column whose position we should resolve 51 | * @param[in] format Format of the CSV file 52 | */ 53 | CSV_INLINE int get_col_pos( 54 | csv::string_view filename, 55 | csv::string_view col_name, 56 | const CSVFormat& format) { 57 | CSVReader reader(filename, format); 58 | return reader.index_of(col_name); 59 | } 60 | 61 | /** Get basic information about a CSV file 62 | * @include programs/csv_info.cpp 63 | */ 64 | CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { 65 | CSVReader reader(filename); 66 | CSVFormat format = reader.get_format(); 67 | for (auto it = reader.begin(); it != reader.end(); ++it); 68 | 69 | CSVFileInfo info = { 70 | filename, 71 | reader.get_col_names(), 72 | format.get_delim(), 73 | reader.n_rows(), 74 | reader.get_col_names().size() 75 | }; 76 | 77 | return info; 78 | } 79 | } 
-------------------------------------------------------------------------------- /tests/test_csv_field_array.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "csv.hpp" 5 | 6 | using namespace csv; 7 | using namespace csv::internals; 8 | 9 | TEST_CASE("Test Dynamic RawCSVFieldArray - Emplace Back", "[test_dynamic_array_emplace]") { 10 | using namespace csv::internals; 11 | 12 | constexpr size_t offset = 100; 13 | 14 | // Array size should be smaller than the number of items we want to push 15 | CSVFieldList arr(500); 16 | 17 | for (size_t i = 0; i < 9999; i++) { 18 | arr.emplace_back(i, i + offset); 19 | 20 | // Check operator[] as field was just populated 21 | REQUIRE(arr[i].start == i); 22 | REQUIRE(arr[i].length == i + offset); 23 | 24 | REQUIRE(arr.size() == i + 1); 25 | } 26 | 27 | for (size_t i = 0; i < 9999; i++) { 28 | // Check for potential data corruption 29 | REQUIRE(arr[i].start == i); 30 | REQUIRE(arr[i].length == i + offset); 31 | } 32 | } 33 | 34 | TEST_CASE("Test CSVFieldArray Thread Safety", "[test_array_thread]") { 35 | constexpr size_t offset = 100; 36 | 37 | // Array size should be smaller than the number of items we want to push 38 | CSVFieldList arr(500); 39 | 40 | for (size_t i = 0; i < 9999; i++) { 41 | arr.emplace_back(i, i + offset); 42 | 43 | // Check operator[] as field was just populated 44 | REQUIRE(arr[i].start == i); 45 | REQUIRE(arr[i].length == i + offset); 46 | 47 | REQUIRE(arr.size() == i + 1); 48 | } 49 | 50 | // Check contents from another thread 51 | constexpr size_t num_workers = 4; 52 | constexpr size_t chunk_size = 9999 / num_workers; 53 | std::vector> workers = {}; 54 | 55 | for (size_t i = 0; i < num_workers; i++) { 56 | size_t start = i * chunk_size; 57 | size_t end = start + chunk_size; 58 | 59 | workers.push_back( 60 | std::async([](const CSVFieldList& arr, size_t start, size_t end, size_t offset) { 61 | for (size_t i = start; i < end; i++) 
{ 62 | if (arr[i].start != i || arr[i].length != i + offset) 63 | return false; 64 | } 65 | 66 | return true; 67 | }, std::ref(arr), start, end, offset) 68 | ); 69 | } 70 | 71 | // Writer from another thread 72 | for (size_t i = 9999; i < 19999; i++) { 73 | arr.emplace_back(i, i + offset); 74 | 75 | // Check operator[] as field was just populated 76 | REQUIRE(arr[i].start == i); 77 | REQUIRE(arr[i].length == i + offset); 78 | 79 | REQUIRE(arr.size() == i + 1); 80 | } 81 | 82 | for (auto& result : workers) { 83 | REQUIRE(result.get() == true); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: 2 | - cpp 3 | matrix: 4 | include: 5 | - os: linux 6 | env: STD=c++11 CSV_CXX_STANDARD=11 CXX_COMPILER=g++-9 C_COMPILER=gcc-9 7 | compiler: gcc 8 | addons: 9 | apt: 10 | sources: ['ubuntu-toolchain-r-test'] 11 | packages: ['g++-9', 'cmake', 'valgrind', 'doxygen'] 12 | - os: linux 13 | env: STD=c++14 CSV_CXX_STANDARD=14 CXX_COMPILER=g++-9 C_COMPILER=gcc-9 14 | compiler: gcc 15 | addons: 16 | apt: 17 | sources: ['ubuntu-toolchain-r-test'] 18 | packages: ['g++-9', 'cmake', 'doxygen'] 19 | - os: linux 20 | env: STD=c++17 CSV_CXX_STANDARD=17 MAIN_BUILD=true CXX_COMPILER=g++-9 C_COMPILER=gcc-9 21 | compiler: gcc 22 | addons: 23 | apt: 24 | sources: ['ubuntu-toolchain-r-test'] 25 | packages: ['g++-9', 'cmake', 'valgrind', 'doxygen'] 26 | - os: linux 27 | dist: focal 28 | env: CSV_CXX_STANDARD=11 CXX_COMPILER=clang++-11 C_COMPILER=clang-11 29 | compiler: clang 30 | addons: 31 | apt: 32 | sources: 33 | - sourceline: 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main' 34 | key_url: https://apt.llvm.org/llvm-snapshot.gpg.key 35 | packages: 36 | - clang-11 37 | - os: linux 38 | dist: focal 39 | env: CSV_CXX_STANDARD=14 CXX_COMPILER=clang++-11 C_COMPILER=clang-11 40 | compiler: clang 41 | addons: 42 | apt: 43 | sources: 44 | 
- sourceline: 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main' 45 | key_url: https://apt.llvm.org/llvm-snapshot.gpg.key 46 | packages: 47 | - clang-11 48 | - os: linux 49 | dist: focal 50 | env: CSV_CXX_STANDARD=17 CXX_COMPILER=clang++-11 C_COMPILER=clang-11 51 | compiler: clang 52 | addons: 53 | apt: 54 | sources: 55 | - sourceline: 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main' 56 | key_url: https://apt.llvm.org/llvm-snapshot.gpg.key 57 | packages: 58 | - clang-11 59 | dist: trusty 60 | sudo: required 61 | script: 62 | - export CSV_TEST_ROOT=$PWD/tests 63 | - cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=$CXX_COMPILER -DCMAKE_C_COMPILER=$C_COMPILER -DCSV_CXX_STANDARD=$CSV_CXX_STANDARD 64 | - make csv_test 65 | - ./tests/csv_test 66 | 67 | # Memory leak check 68 | - if [ "$MAIN_BUILD" == "true" ]; then 69 | make csv_stats; 70 | valgrind --leak-check=full ./programs/csv_stats $PWD/tests/data/real_data/2016_Gaz_place_national.txt; 71 | fi; 72 | after_success: 73 | - if [ "$MAIN_BUILD" == "true" ]; then 74 | doxygen Doxyfile; 75 | mv csv_coverage ./docs; 76 | fi; 77 | deploy: 78 | provider: pages:git 79 | edge: true 80 | cleanup: false 81 | token: $GITHUB_TOKEN 82 | keep_history: true 83 | local_dir: docs 84 | target_branch: gh-pages 85 | on: 86 | branch: master 87 | -------------------------------------------------------------------------------- /include/internal/csv_format.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Defines an object used to store CSV format settings 3 | */ 4 | 5 | #include 6 | #include 7 | 8 | #include "csv_format.hpp" 9 | 10 | namespace csv { 11 | CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { 12 | this->possible_delimiters = { delim }; 13 | this->assert_no_char_overlap(); 14 | return *this; 15 | } 16 | 17 | CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { 18 | this->possible_delimiters = delim; 19 | 
this->assert_no_char_overlap(); 20 | return *this; 21 | } 22 | 23 | CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { 24 | this->no_quote = false; 25 | this->quote_char = quote; 26 | this->assert_no_char_overlap(); 27 | return *this; 28 | } 29 | 30 | CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { 31 | this->trim_chars = chars; 32 | this->assert_no_char_overlap(); 33 | return *this; 34 | } 35 | 36 | CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { 37 | this->col_names = names; 38 | this->header = -1; 39 | return *this; 40 | } 41 | 42 | CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { 43 | if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; 44 | 45 | this->header = row; 46 | this->col_names = {}; 47 | return *this; 48 | } 49 | 50 | CSV_INLINE void CSVFormat::assert_no_char_overlap() 51 | { 52 | auto delims = std::set( 53 | this->possible_delimiters.begin(), this->possible_delimiters.end()), 54 | trims = std::set( 55 | this->trim_chars.begin(), this->trim_chars.end()); 56 | 57 | // Stores intersection of possible delimiters and trim characters 58 | std::vector intersection = {}; 59 | 60 | // Find which characters overlap, if any 61 | std::set_intersection( 62 | delims.begin(), delims.end(), 63 | trims.begin(), trims.end(), 64 | std::back_inserter(intersection)); 65 | 66 | // Make sure quote character is not contained in possible delimiters 67 | // or whitespace characters 68 | if (delims.find(this->quote_char) != delims.end() || 69 | trims.find(this->quote_char) != trims.end()) { 70 | intersection.push_back(this->quote_char); 71 | } 72 | 73 | if (!intersection.empty()) { 74 | std::string err_msg = "There should be no overlap between the quote character, " 75 | "the set of possible delimiters " 76 | "and the set of whitespace characters. 
Offending characters: "; 77 | 78 | // Create a pretty error message with the list of overlapping 79 | // characters 80 | for (size_t i = 0; i < intersection.size(); i++) { 81 | err_msg += "'"; 82 | err_msg += intersection[i]; 83 | err_msg += "'"; 84 | 85 | if (i + 1 < intersection.size()) 86 | err_msg += ", "; 87 | } 88 | 89 | throw std::runtime_error(err_msg + '.'); 90 | } 91 | } 92 | } -------------------------------------------------------------------------------- /tests/test_csv_stat.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "csv.hpp" 4 | using namespace csv; 5 | 6 | const std::string PERSONS_CSV = "./tests/data/mimesis_data/persons.csv"; 7 | 8 | // Regression test for #208: Try to parse an empty file shouldn't result in a SEGFAULT 9 | TEST_CASE("Empty File", "[read_csv_stat_empty]") { 10 | bool error_caught = false; 11 | 12 | try { 13 | CSVStat stats("./tests/data/fake_data/empty.csv"); 14 | stats.get_mins(); 15 | } 16 | catch (std::runtime_error& err) { 17 | error_caught = true; 18 | REQUIRE(strcmp(err.what(), "Cannot open file ./tests/data/fake_data/empty.csv") == 0); 19 | } 20 | 21 | REQUIRE(error_caught); 22 | } 23 | 24 | TEST_CASE("Calculating Statistics from Direct Input", "[read_csv_stat_direct]" ) { 25 | std::string int_str; 26 | std::stringstream int_list; 27 | for (int i = 1; i < 101; i++) { 28 | int_str = std::to_string(i); 29 | int_list << int_str << "," << int_str << "," << int_str << "\r\n"; 30 | } 31 | 32 | // Expected results 33 | CSVFormat format; 34 | format.column_names({ "A", "B", "C" }); 35 | 36 | CSVStat reader(int_list, format); 37 | 38 | std::vector means = { 50.5, 50.5, 50.5 }; 39 | std::vector mins = { 1, 1, 1 }; 40 | std::vector maxes = { 100, 100, 100 }; 41 | 42 | REQUIRE( reader.get_mins() == mins ); 43 | REQUIRE( reader.get_maxes() == maxes ); 44 | REQUIRE( reader.get_mean() == means ); 45 | REQUIRE( ceill(reader.get_variance()[0]) == 842 ); 46 | 47 
| // Make sure all integers between 1 and 100 have a count of 1 48 | for (int i = 1; i < 101; i++) 49 | REQUIRE( reader.get_counts()[0][std::to_string(i)] == 1 ); 50 | 51 | // Confirm column at pos 0 has 100 integers (type 2) 52 | REQUIRE( reader.get_dtypes()[0][DataType::CSV_INT8] == 100 ); 53 | } 54 | 55 | TEST_CASE( "Statistics - Rows of Integers", "[read_csv_stat]" ) { 56 | // Header on first row 57 | auto file = GENERATE(as {}, 58 | "./tests/data/fake_data/ints.csv", 59 | "./tests/data/fake_data/ints_newline_sep.csv" 60 | ); 61 | 62 | SECTION("Compute Statistics") { 63 | CSVStat reader(file); 64 | 65 | // Expected Results 66 | std::vector means = { 67 | 50.5, 50.5, 50.5, 50.5, 50.5, 68 | 50.5, 50.5, 50.5, 50.5, 50.5 69 | }; 70 | 71 | REQUIRE(reader.get_mean() == means); 72 | REQUIRE(reader.get_mins()[0] == 1); 73 | REQUIRE(reader.get_maxes()[0] == 100); 74 | REQUIRE(ceill(reader.get_variance()[0]) == 842); 75 | } 76 | } 77 | 78 | TEST_CASE( "Statistics - persons.csv", "[test_stat_person]" ) { 79 | CSVStat reader(PERSONS_CSV); 80 | REQUIRE(reader.get_maxes()[0] == 49999); 81 | REQUIRE( ceill(reader.get_mean()[2]) == 42 ); 82 | } 83 | 84 | TEST_CASE("Data Types - persons.csv", "[test_dtypes_person]") { 85 | auto dtypes = csv_data_types(PERSONS_CSV); 86 | 87 | REQUIRE(dtypes["Full Name"] == DataType::CSV_STRING); 88 | REQUIRE(dtypes["Age"] == DataType::CSV_INT8); 89 | REQUIRE(dtypes["Occupation"] == DataType::CSV_STRING); 90 | REQUIRE(dtypes["Email"] == DataType::CSV_STRING); 91 | REQUIRE(dtypes["Telephone"] == DataType::CSV_STRING); 92 | REQUIRE(dtypes["Nationality"] == DataType::CSV_STRING); 93 | } 94 | -------------------------------------------------------------------------------- /docs/source/Doxy.md: -------------------------------------------------------------------------------- 1 | # Vince's CSV Library 2 | 3 | This is the detailed documentation for Vince's CSV library. 
4 | For quick examples, go to this project's [GitHub page](https://github.com/vincentlaucsb/csv-parser). 5 | 6 | ## Outline 7 | 8 | ### CSV Reading 9 | * csv::CSVFormat: \copybrief csv::CSVFormat 10 | * csv::CSVReader 11 | * csv::CSVReader::n_rows(): \copybrief csv::CSVReader::n_rows() 12 | * csv::CSVReader::utf8_bom(): \copybrief csv::CSVReader::utf8_bom() 13 | * csv::CSVReader::get_format(): \copybrief csv::CSVReader::get_format() 14 | * Retrieving data 15 | * csv::CSVReader::iterator: Recommended 16 | * csv::CSVReader::begin() 17 | * csv::CSVReader::end() 18 | * csv::CSVReader::read_row() 19 | * Convenience Functions 20 | * csv::parse() 21 | * csv::operator ""_csv() 22 | * csv::parse_no_header() 23 | * csv::operator ""_csv_no_header() 24 | 25 | #### See also 26 | [Dealing with Variable Length CSV Rows](md_docs_source_variable_row_lengths.html) 27 | 28 | #### Working with parsed data 29 | * csv::CSVRow: \copybrief csv::CSVRow 30 | * csv::CSVRow::operator std::vector() 31 | * csv::CSVRow::iterator 32 | * csv::CSVRow::begin() 33 | * csv::CSVRow::end() 34 | * csv::CSVRow::to_json() 35 | * csv::CSVRow::to_json_array() 36 | * csv::CSVField 37 | * csv::CSVField::get(): \copybrief csv::CSVField::get() 38 | * csv::CSVField::operator==() 39 | 40 | ### Statistics 41 | * csv::CSVStat 42 | 43 | ### CSV Writing 44 | * csv::make_csv_writer(): Construct a csv::CSVWriter 45 | * csv::make_tsv_writer(): Construct a csv::TSVWriter 46 | * csv::DelimWriter 47 | * Pre-Defined Specializations 48 | * csv::CSVWriter 49 | * csv::TSVWriter 50 | * Methods 51 | * csv::DelimWriter::operator<<() 52 | 53 | ## Frequently Asked Questions 54 | 55 | ### How does automatic starting row detection work? 56 | See "How does automatic delimiter detection work?" 57 | 58 | ### How does automatic delimiter detection work? 59 | First, the CSV reader attempts to parse the first 100 lines of a CSV file as if the delimiter were a pipe, tab, comma, etc. 
60 | Out of all the possible delimiter choices, the delimiter which produces the highest number of `rows * columns` (where all rows 61 | are of a consistent length) is chosen as the winner. 62 | 63 | However, if the CSV file has leading comments, or has fewer than 100 lines, a second heuristic will be used. The CSV reader again 64 | parses the first 100 lines using each candidate delimiter, but tallies up the length of each row parsed. Then, the delimiter with 65 | the largest most common row length `n` is chosen as the winner, and the line number where the first row of length `n` occurs 66 | is chosen as the starting row. 67 | 68 | Because you can subclass csv::CSVReader, you can implement your own guessing heuristic. csv::internals::CSVGuesser may be used as a helpful guide in doing so. 69 | 70 | ### Is the CSV parser thread-safe? 71 | This library already does a lot of work behind the scenes to use threads to squeeze 72 | performance from your CPU. However, ambitious users who are in the mood for 73 | experimenting should follow these guidelines: 74 | * csv::CSVReader::iterator should only be used from one thread 75 | * A workaround is to chunk blocks of `CSVRow` objects together and 76 | create separate threads to process each column 77 | * csv::CSVRow may be safely processed from multiple threads 78 | * csv::CSVField objects should only be read from one thread at a time 79 | * **Note**: csv::CSVRow::operator[]() produces separate copies of `csv::CSVField` objects -------------------------------------------------------------------------------- /.github/workflows/cmake-multi-platform.yml: -------------------------------------------------------------------------------- 1 | # This starter workflow is for a CMake project running on multiple platforms. There is a different starter workflow if you just want a single platform. 
2 | # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml 3 | name: CMake on multiple platforms 4 | 5 | on: 6 | push: 7 | branches: [ "master", "memory-fix-csvfieldlist" ] 8 | pull_request: 9 | branches: [ "master" ] 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | 15 | strategy: 16 | # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. 17 | fail-fast: false 18 | 19 | # Set up a matrix to run the following 3 configurations: 20 | # 1. 21 | # 2. 22 | # 3. 23 | # 24 | # To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list. 25 | matrix: 26 | os: [windows-latest, ubuntu-latest] 27 | build_type: [Release] 28 | c_compiler: [gcc, cl] 29 | cxx_standard: [17, 20] 30 | include: 31 | - os: windows-latest 32 | c_compiler: cl 33 | cpp_compiler: cl 34 | - os: ubuntu-latest 35 | c_compiler: gcc 36 | cpp_compiler: g++ 37 | exclude: 38 | - os: windows-latest 39 | c_compiler: gcc 40 | - os: ubuntu-latest 41 | c_compiler: cl 42 | 43 | steps: 44 | - name: Checkout repository and submodules 45 | uses: actions/checkout@v4 46 | with: 47 | submodules: recursive 48 | 49 | - name: Set reusable strings 50 | # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. 51 | id: strings 52 | shell: bash 53 | run: | 54 | echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" 55 | 56 | - name: Configure CMake 57 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 
58 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 59 | run: > 60 | cmake -B ${{ steps.strings.outputs.build-output-dir }} 61 | -DCSV_CXX_STANDARD=${{ matrix.cxx_standard }} 62 | -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} 63 | -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} 64 | -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} 65 | -S ${{ github.workspace }} 66 | 67 | - name: Build 68 | # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). 69 | run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} 70 | 71 | - name: Test 72 | working-directory: ${{ steps.strings.outputs.build-output-dir }} 73 | # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). 74 | # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail 75 | run: ctest --build-config ${{ matrix.build_type }} 76 | -------------------------------------------------------------------------------- /tests/test_csv_row_json.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "csv.hpp" 3 | 4 | #include 5 | using namespace csv; 6 | 7 | /** Construct a CSVRow object for testing given column names and CSV fields */ 8 | CSVRow make_csv_row(std::vector data, std::vector col_names) { 9 | // Concatenate vector or strings into one large string 10 | using namespace csv::internals; 11 | 12 | std::stringstream raw_csv; 13 | auto writer = make_csv_writer(raw_csv); 14 | writer << col_names; 15 | writer << data; 16 | 17 | CSVReader reader(raw_csv); 18 | CSVRow row; 19 | reader.read_row(row); 20 | 21 | return row; 22 | } 23 | 24 | TEST_CASE("json_escape_string() Test", "[json_escape_string]") { 25 | using 
internals::json_escape_string; 26 | 27 | // Assert that special characters are escaped properly 28 | REQUIRE(json_escape_string("Quote\"Quote") == "Quote\\\"Quote"); 29 | REQUIRE(json_escape_string("RSolidus\\RSolidus") 30 | == "RSolidus\\\\RSolidus"); 31 | REQUIRE(json_escape_string("Backspace\bBackspace") 32 | == "Backspace\\bBackspace"); 33 | REQUIRE(json_escape_string("Formfeed\fFormfeed") 34 | == "Formfeed\\fFormfeed"); 35 | REQUIRE(json_escape_string("Newline\nNewline") 36 | == "Newline\\nNewline"); 37 | REQUIRE(json_escape_string("CarriageReturn\rCarriageReturn") 38 | == "CarriageReturn\\rCarriageReturn"); 39 | REQUIRE(json_escape_string("Tab\tTab") 40 | == "Tab\\tTab"); 41 | 42 | // Assert that control characters are escaped properly 43 | REQUIRE(json_escape_string("Null\0Null") 44 | == "Null\u0000Null"); 45 | } 46 | 47 | TEST_CASE("CSVRow to_json() Test", "[csv_row_to_json]") { 48 | CSVRow row = make_csv_row( 49 | { "Col 1", "Col 2" }, // Fields 50 | { "A", "B" } // Column names 51 | ); 52 | 53 | REQUIRE(row.to_json() == "{\"A\":\"Col 1\",\"B\":\"Col 2\"}"); 54 | } 55 | 56 | TEST_CASE("CSVRow to_json() Test with Numbers", "[csv_numeric_row_to_json]") { 57 | CSVRow row = make_csv_row( 58 | { "1234.3", "234" }, // Fields 59 | { "A", "B"} // Column names 60 | ); 61 | 62 | REQUIRE(row.to_json() == "{\"A\":1234.3,\"B\":234}"); 63 | } 64 | 65 | TEST_CASE("CSVRow to_json() Test - Mixed", "[csv_mixed_row_to_json]") { 66 | CSVRow row = make_csv_row( 67 | { "1234.3", "234", "ABCD", "AB1", "1337" }, // Fields 68 | { "A", "B", "C", "D", "E" } // Column names 69 | ); 70 | 71 | SECTION("Full Row") { 72 | REQUIRE(row.to_json() == "{\"A\":1234.3,\"B\":234,\"C\":\"ABCD\",\"D\":\"AB1\",\"E\":1337}"); 73 | } 74 | 75 | SECTION("Subset") { 76 | REQUIRE(row.to_json({ "B", "C" }) == "{\"B\":234,\"C\":\"ABCD\"}"); 77 | REQUIRE(row.to_json({ "B", "A" }) == "{\"B\":234,\"A\":1234.3}"); 78 | } 79 | } 80 | 81 | TEST_CASE("CSVRow to_json_array() Test() - Mixed", 
"[csv_mixed_row_to_json_array]") { 82 | CSVRow row = make_csv_row( 83 | { "1234.3", "234", "ABCD", "AB1", "1337" }, // Fields 84 | { "A", "B", "C", "D", "E" } // Column names 85 | ); 86 | 87 | SECTION("Full Row") { 88 | REQUIRE(row.to_json_array() == "[1234.3,234,\"ABCD\",\"AB1\",1337]"); 89 | } 90 | 91 | SECTION("Subset") { 92 | REQUIRE(row.to_json_array({ "B", "C" }) == "[234,\"ABCD\"]"); 93 | REQUIRE(row.to_json_array({ "B", "A" }) == "[234,1234.3]"); 94 | } 95 | } 96 | 97 | // Reported in: https://github.com/vincentlaucsb/csv-parser/issues/68 98 | TEST_CASE("CSVRow to_json() with Wrong Columns", "[csv_json_wrong_cols]") { 99 | std::stringstream csv_string(R"(A,B,C, 100 | 123,345,678,)"); 101 | 102 | auto format = CSVFormat(); 103 | format.column_names({ "A", "B" }); 104 | 105 | CSVReader reader(csv_string, format); 106 | CSVRow first_row; 107 | reader.read_row(first_row); 108 | 109 | // Since the column names provided were wrong, there won't be any data. 110 | // to_json() method should then produce an empty object instead of segfaulting. 
111 | REQUIRE(first_row.to_json() == "{}"); 112 | REQUIRE(first_row.to_json_array() == "[]"); 113 | } -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(csv) 3 | 4 | if(CSV_CXX_STANDARD) 5 | set(CMAKE_CXX_STANDARD ${CSV_CXX_STANDARD}) 6 | else() 7 | set(CMAKE_CXX_STANDARD 17) 8 | endif(CSV_CXX_STANDARD) 9 | 10 | option(BUILD_PYTHON "Build Python Binding" OFF) 11 | 12 | message("Building CSV library using C++${CMAKE_CXX_STANDARD}") 13 | 14 | # Defines CSV_HAS_CXX17 in compatibility.hpp 15 | if (CMAKE_VERSION VERSION_LESS "3.12.0") 16 | add_definitions(-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}) 17 | else() 18 | add_compile_definitions(CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}) 19 | endif() 20 | 21 | set(THREADS_PREFER_PTHREAD_FLAG TRUE) 22 | find_package(Threads QUIET REQUIRED) 23 | 24 | if(MSVC) 25 | # Make Visual Studio report accurate C++ version 26 | # See: https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ 27 | # /Wall emits warnings about the C++ standard library 28 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /GS- /Zc:__cplusplus /W4") 29 | else() 30 | # Ignore Visual Studio pragma regions 31 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") 32 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} --coverage -Og") 33 | endif(MSVC) 34 | 35 | set(CSV_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR}) 36 | set(CSV_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) 37 | set(CSV_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/) 38 | set(CSV_SOURCE_DIR ${CSV_INCLUDE_DIR}/internal/) 39 | set(CSV_TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests) 40 | 41 | include_directories(${CSV_INCLUDE_DIR}) 42 | 43 | ## Load developer specific CMake settings 44 | if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) 45 | SET(CSV_DEVELOPER TRUE) 46 | endif() 47 | 48 | ## Main Library 49 | 
add_subdirectory(${CSV_SOURCE_DIR}) 50 | 51 | # build the python binding for the library 52 | if (${BUILD_PYTHON}) 53 | message("Building Python bindings for the library.") 54 | add_subdirectory(python) 55 | endif() 56 | 57 | ## Executables 58 | option(CSV_BUILD_PROGRAMS "Allow to disable building of programs" ON) 59 | if (CSV_BUILD_PROGRAMS) 60 | add_subdirectory("programs") 61 | endif() 62 | 63 | ## Developer settings 64 | if (CSV_DEVELOPER) 65 | # Allow for performance profiling 66 | if (MSVC) 67 | target_link_options(csv PUBLIC /PROFILE) 68 | endif() 69 | 70 | # More error messages. 71 | if (UNIX) 72 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \ 73 | -Wall -Wextra -Wsign-compare \ 74 | -Wwrite-strings -Wpointer-arith -Winit-self \ 75 | -Wconversion -Wno-sign-conversion") 76 | endif() 77 | 78 | # Generate a single header library 79 | if(CMAKE_VERSION VERSION_LESS "3.12") 80 | find_package(PythonInterp 3 QUIET) 81 | else() 82 | find_package(Python3 COMPONENTS Interpreter) 83 | endif() 84 | if(Python3_Interpreter_FOUND OR PYTHONINTERP_FOUND) 85 | add_custom_target(generate_single_header 86 | COMMAND ${Python3_EXECUTABLE} single_header.py > single_include/csv.hpp 87 | COMMAND ${Python3_EXECUTABLE} single_header.py > single_include_test/csv.hpp 88 | WORKING_DIRECTORY ${CSV_ROOT_DIR} 89 | ) 90 | # Single header compilation test 91 | add_subdirectory(single_include_test) 92 | else() 93 | message(WARNING "Python3 not found, skipping target 'generate_single_header'.") 94 | endif() 95 | 96 | # Documentation 97 | find_package(Doxygen QUIET) 98 | if(DOXYGEN_FOUND) 99 | add_custom_target(doxygen 100 | COMMAND ${DOXYGEN_EXECUTABLE} ${CSV_ROOT_DIR}/Doxyfile 101 | WORKING_DIRECTORY ${CSV_ROOT_DIR} 102 | ) 103 | else() 104 | message(WARNING "Doxygen not found, skipping target 'doxygen'.") 105 | endif() 106 | 107 | ## Tests 108 | enable_testing() 109 | add_subdirectory("tests") 110 | 111 | # Code coverage 112 | #find_program( GCOV_PATH gcov ) 113 | #if(GCOV_PATH) 114 | # 
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") 115 | # include(CodeCoverage) 116 | # append_coverage_compiler_flags() 117 | # set(ENV{CSV_TEST_ROOT} ${CSV_TEST_DIR}) 118 | # setup_target_for_coverage_gcovr_html( 119 | # NAME csv_coverage 120 | # EXECUTABLE csv_test 121 | # EXCLUDE "tests/*" 122 | # ) 123 | #endif() 124 | endif() 125 | -------------------------------------------------------------------------------- /tests/test_csv_iterator.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Tests for the CSVRow Iterators and CSVReader Iterators 3 | // 4 | 5 | #include 6 | #include "csv.hpp" 7 | using namespace csv; 8 | 9 | ////////////////////// 10 | // CSVRow Iterators // 11 | ////////////////////// 12 | 13 | TEST_CASE("Test CSVRow Interator", "[test_csv_row_iter]") { 14 | auto rows = "A,B,C\r\n" // Header row 15 | "123,234,345\r\n" 16 | "1,2,3\r\n" 17 | "1,2,3"_csv; 18 | 19 | CSVRow row; 20 | rows.read_row(row); 21 | 22 | SECTION("Forwards and Backwards Iterators") { 23 | // Forwards 24 | REQUIRE(row.begin()->get() == 123); 25 | REQUIRE((row.end() - 1)->get<>() == "345"); 26 | 27 | size_t i = 0; 28 | for (auto it = row.begin(); it != row.end(); ++it) { 29 | if (i == 0) REQUIRE(it->get<>() == "123"); 30 | else if (i == 1) REQUIRE(it->get<>() == "234"); 31 | else REQUIRE(it->get<>() == "345"); 32 | 33 | i++; 34 | } 35 | 36 | // Backwards 37 | REQUIRE(row.rbegin()->get() == 345); 38 | REQUIRE((row.rend() - 1)->get<>() == "123"); 39 | } 40 | 41 | SECTION("Iterator Arithmetic") { 42 | REQUIRE(row.begin()->get() == 123); 43 | REQUIRE((row.end() - 1)->get<>() == "345"); 44 | 45 | auto row_start = row.begin(); 46 | REQUIRE(*(row_start + 1) == "234"); 47 | REQUIRE(*(row_start + 2) == "345"); 48 | 49 | } 50 | 51 | SECTION("Post-Increment Iterator") { 52 | auto it = row.begin(); 53 | 54 | REQUIRE(it++->get() == 123); 55 | REQUIRE(it->get() == 234); 56 | 57 | REQUIRE(it--->get() == 234); 58 | REQUIRE(it->get() == 
123); 59 | } 60 | 61 | SECTION("Range Based For") { 62 | size_t i = 0; 63 | for (auto& field : row) { 64 | if (i == 0) REQUIRE(field.get<>() == "123"); 65 | else if (i == 1) REQUIRE(field.get<>() == "234"); 66 | else REQUIRE(field.get<>() == "345"); 67 | 68 | i++; 69 | } 70 | } 71 | } 72 | 73 | ///////////////////////// 74 | // CSVReader Iterators // 75 | ///////////////////////// 76 | 77 | //! [CSVReader Iterator 1] 78 | TEST_CASE("Basic CSVReader Iterator Test", "[read_ints_iter]") { 79 | // A file with 100 rows and columns A, B, ... J 80 | // where every value in the ith row is the number i 81 | CSVReader reader("./tests/data/fake_data/ints.csv"); 82 | std::vector col_names = { 83 | "A", "B", "C", "D", "E", "F", "G", "H", "I", "J" 84 | }; 85 | int i = 1; 86 | 87 | SECTION("Basic Iterator") { 88 | for (auto it = reader.begin(); it != reader.end(); ++it) { 89 | REQUIRE((*it)[0].get() == i); 90 | i++; 91 | } 92 | } 93 | 94 | SECTION("Iterator Post-Increment") { 95 | auto it = reader.begin(); 96 | REQUIRE((it++)->operator[]("A").get() == 1); 97 | REQUIRE(it->operator[]("A").get() == 2); 98 | } 99 | 100 | SECTION("Range-Based For Loop") { 101 | for (auto& row : reader) { 102 | for (auto& j : col_names) REQUIRE(row[j].get() == i); 103 | i++; 104 | } 105 | } 106 | } 107 | //! [CSVReader Iterator 1] 108 | 109 | //! 
[CSVReader Iterator 2] 110 | TEST_CASE("CSVReader Iterator + std::max_elem", "[iter_max_elem]") { 111 | // The first file is such that each value in the ith row is the number i 112 | // There are 100 rows 113 | // The second file is a database of California state employee salaries 114 | CSVReader r1("./tests/data/fake_data/ints.csv"), 115 | r2("./tests/data/real_data/2015_StateDepartment.csv"); 116 | 117 | // Find largest number; both iterators must come from the same reader (r1) 118 | auto int_finder = [](CSVRow& left, CSVRow& right) { 119 | return (left["A"].get() < right["A"].get()); 120 | }; 121 | 122 | auto max_int = std::max_element(r1.begin(), r1.end(), int_finder); 123 | 124 | // Find highest salary 125 | auto wage_finder = [](CSVRow& left, CSVRow& right) { 126 | return (left["Total Wages"].get() < right["Total Wages"].get()); 127 | }; 128 | 129 | auto max_wage = std::max_element(r2.begin(), r2.end(), wage_finder); 130 | 131 | REQUIRE((*max_int)["A"] == 100); 132 | REQUIRE((*max_wage)["Total Wages"] == "812064.87"); 133 | } 134 | //! [CSVReader Iterator 2] 135 | -------------------------------------------------------------------------------- /tests/test_write_csv.cpp: -------------------------------------------------------------------------------- 1 | #include // For remove() 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "csv.hpp" 7 | 8 | using namespace csv; 9 | using std::queue; 10 | using std::vector; 11 | using std::string; 12 | 13 | #ifndef __clang__ 14 | TEST_CASE("Numeric Converter Tsts", "[test_convert_number]") { 15 | SECTION("num_digits") { 16 | REQUIRE(csv::internals::num_digits(99.0) == 2); 17 | REQUIRE(csv::internals::num_digits(100.0) == 3); 18 | } 19 | 20 | SECTION("Large Numbers") { 21 | // Large numbers: integer larger than uint64 capacity 22 | REQUIRE(csv::internals::to_string(200000000000000000000.0) == "200000000000000000000.0"); 23 | REQUIRE(csv::internals::to_string(310000000000000000000.0) == "310000000000000000000.0"); 24 | } 25 | 26 | SECTION("Custom Precision") { 27 | 
// Test setting precision 28 | REQUIRE(csv::internals::to_string(1.234) == "1.23400"); 29 | REQUIRE(csv::internals::to_string(20.0045) == "20.00450"); 30 | 31 | set_decimal_places(2); 32 | REQUIRE(csv::internals::to_string(1.234) == "1.23"); 33 | 34 | // Reset 35 | set_decimal_places(5); 36 | } 37 | 38 | SECTION("Decimal Numbers x where -1 < x < 0") { 39 | REQUIRE(csv::internals::to_string(-0.25) == "-0.25000"); 40 | REQUIRE(csv::internals::to_string(-0.625) == "-0.62500"); 41 | REQUIRE(csv::internals::to_string(-0.666) == "-0.66600"); 42 | } 43 | 44 | SECTION("Numbers Close to 10^n - Regression") { 45 | REQUIRE(csv::internals::to_string(10.0) == "10.0"); 46 | REQUIRE(csv::internals::to_string(100.0) == "100.0"); 47 | REQUIRE(csv::internals::to_string(1000.0) == "1000.0"); 48 | REQUIRE(csv::internals::to_string(10000.0) == "10000.0"); 49 | REQUIRE(csv::internals::to_string(100000.0) == "100000.0"); 50 | REQUIRE(csv::internals::to_string(1000000.0) == "1000000.0"); 51 | } 52 | } 53 | #endif 54 | 55 | TEST_CASE("Basic CSV Writing Cases", "[test_csv_write]") { 56 | std::stringstream output, correct; 57 | auto writer = make_csv_writer(output); 58 | 59 | SECTION("Escaped Comma") { 60 | writer << std::array({ "Furthermore, this should be quoted." }); 61 | correct << "\"Furthermore, this should be quoted.\""; 62 | } 63 | 64 | SECTION("Quote Escape") { 65 | writer << std::array({ "\"What does it mean to be RFC 4180 compliant?\" she asked." 
}); 66 | correct << "\"\"\"What does it mean to be RFC 4180 compliant?\"\" she asked.\""; 67 | } 68 | 69 | SECTION("Newline Escape") { 70 | writer << std::array({ "Line 1\nLine2" }); 71 | correct << "\"Line 1\nLine2\""; 72 | } 73 | 74 | SECTION("Leading and Trailing Quote Escape") { 75 | writer << std::array({ "\"\"" }); 76 | correct << "\"\"\"\"\"\""; 77 | } 78 | 79 | SECTION("Quote Minimal") { 80 | writer << std::array({ "This should not be quoted" }); 81 | correct << "This should not be quoted"; 82 | } 83 | 84 | correct << std::endl; 85 | REQUIRE(output.str() == correct.str()); 86 | } 87 | 88 | TEST_CASE("CSV Quote All", "[test_csv_quote_all]") { 89 | std::stringstream output, correct; 90 | auto writer = make_csv_writer(output, false); 91 | 92 | writer << std::array({ "This should be quoted" }); 93 | correct << "\"This should be quoted\"" << std::endl; 94 | 95 | REQUIRE(output.str() == correct.str()); 96 | } 97 | 98 | //! [CSV Writer Example] 99 | TEMPLATE_TEST_CASE("CSV/TSV Writer - operator <<", "[test_csv_operator<<]", 100 | std::vector, std::deque, std::list) { 101 | std::stringstream output, correct_comma, correct_tab; 102 | 103 | // Build correct strings 104 | correct_comma << "A,B,C" << std::endl << "\"1,1\",2,3" << std::endl; 105 | correct_tab << "A\tB\tC" << std::endl << "1,1\t2\t3" << std::endl; 106 | 107 | // Test input 108 | auto test_row_1 = TestType({ "A", "B", "C" }), 109 | test_row_2 = TestType({ "1,1", "2", "3" }); 110 | 111 | SECTION("CSV Writer") { 112 | auto csv_writer = make_csv_writer(output); 113 | csv_writer << test_row_1 << test_row_2; 114 | 115 | REQUIRE(output.str() == correct_comma.str()); 116 | } 117 | 118 | SECTION("TSV Writer") { 119 | auto tsv_writer = make_tsv_writer(output); 120 | tsv_writer << test_row_1 << test_row_2; 121 | 122 | REQUIRE(output.str() == correct_tab.str()); 123 | } 124 | } 125 | //! [CSV Writer Example] 126 | 127 | //! 
[CSV Writer Tuple Example] 128 | struct Time { 129 | std::string hour; 130 | std::string minute; 131 | 132 | operator std::string() const { 133 | std::string ret = hour; 134 | ret += ":"; 135 | ret += minute; 136 | 137 | return ret; 138 | } 139 | }; 140 | 141 | #ifndef __clang__ 142 | TEST_CASE("CSV Tuple", "[test_csv_tuple]") { 143 | #ifdef CSV_HAS_CXX17 144 | Time time = { "5", "30" }; 145 | #else 146 | std::string time = "5:30"; 147 | #endif 148 | std::stringstream output, correct_output; 149 | auto csv_writer = make_csv_writer(output); 150 | 151 | csv_writer << std::make_tuple("One", 2, "Three", 4.0, time) 152 | << std::make_tuple("One", (short)2, "Three", 4.0f, time) 153 | << std::make_tuple(-1, -2.0) 154 | << std::make_tuple(20.2, -20.3, -20.123) 155 | << std::make_tuple(0.0, 0.0f, 0); 156 | 157 | correct_output << "One,2,Three,4.0,5:30" << std::endl 158 | << "One,2,Three,4.0,5:30" << std::endl 159 | << "-1,-2.0" << std::endl 160 | << "20.19999,-20.30000,-20.12300" << std::endl 161 | << "0.0,0.0,0" << std::endl; 162 | 163 | REQUIRE(output.str() == correct_output.str()); 164 | } 165 | #endif 166 | //! [CSV Writer Tuple Example] 167 | -------------------------------------------------------------------------------- /include/internal/csv_format.hpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Defines an object used to store CSV format settings 3 | */ 4 | 5 | #pragma once 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "common.hpp" 12 | 13 | namespace csv { 14 | namespace internals { 15 | class IBasicCSVParser; 16 | } 17 | 18 | class CSVReader; 19 | 20 | /** Determines how to handle rows that are shorter or longer than the majority */ 21 | enum class VariableColumnPolicy { 22 | THROW = -1, 23 | IGNORE_ROW = 0, 24 | KEEP = 1 25 | }; 26 | 27 | /** Stores the inferred format of a CSV file. 
*/ 28 | struct CSVGuessResult { 29 | char delim; 30 | int header_row; 31 | }; 32 | 33 | /** Stores information about how to parse a CSV file. 34 | * Can be used to construct a csv::CSVReader. 35 | */ 36 | class CSVFormat { 37 | public: 38 | /** Settings for parsing a RFC 4180 CSV file */ 39 | CSVFormat() = default; 40 | 41 | /** Sets the delimiter of the CSV file 42 | * 43 | * @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap 44 | */ 45 | CSVFormat& delimiter(char delim); 46 | 47 | /** Sets a list of potential delimiters 48 | * 49 | * @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap 50 | * @param[in] delim An array of possible delimiters to try parsing the CSV with 51 | */ 52 | CSVFormat& delimiter(const std::vector & delim); 53 | 54 | /** Sets the whitespace characters to be trimmed 55 | * 56 | * @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap 57 | * @param[in] ws An array of whitespace characters that should be trimmed 58 | */ 59 | CSVFormat& trim(const std::vector & ws); 60 | 61 | /** Sets the quote character 62 | * 63 | * @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap 64 | */ 65 | CSVFormat& quote(char quote); 66 | 67 | /** Sets the column names. 
68 | * 69 | * @note Unsets any values set by header_row() 70 | */ 71 | CSVFormat& column_names(const std::vector& names); 72 | 73 | /** Sets the header row 74 | * 75 | * @note Unsets any values set by column_names() 76 | */ 77 | CSVFormat& header_row(int row); 78 | 79 | /** Tells the parser that this CSV has no header row 80 | * 81 | * @note Equivalent to `header_row(-1)` 82 | * 83 | */ 84 | CSVFormat& no_header() { 85 | this->header_row(-1); 86 | return *this; 87 | } 88 | 89 | /** Turn quoting on or off */ 90 | CSVFormat& quote(bool use_quote) { 91 | this->no_quote = !use_quote; 92 | return *this; 93 | } 94 | 95 | /** Tells the parser how to handle columns of a different length than the others */ 96 | CONSTEXPR_14 CSVFormat& variable_columns(VariableColumnPolicy policy = VariableColumnPolicy::IGNORE_ROW) { 97 | this->variable_column_policy = policy; 98 | return *this; 99 | } 100 | 101 | /** Tells the parser how to handle columns of a different length than the others */ 102 | CONSTEXPR_14 CSVFormat& variable_columns(bool policy) { 103 | this->variable_column_policy = (VariableColumnPolicy)policy; 104 | return *this; 105 | } 106 | 107 | #ifndef DOXYGEN_SHOULD_SKIP_THIS 108 | char get_delim() const { 109 | // This error should never be received by end users. 
110 | if (this->possible_delimiters.size() > 1) { 111 | throw std::runtime_error("There is more than one possible delimiter."); 112 | } 113 | 114 | return this->possible_delimiters.at(0); 115 | } 116 | 117 | CONSTEXPR bool is_quoting_enabled() const { return !this->no_quote; } 118 | CONSTEXPR char get_quote_char() const { return this->quote_char; } 119 | CONSTEXPR int get_header() const { return this->header; } 120 | std::vector get_possible_delims() const { return this->possible_delimiters; } 121 | std::vector get_trim_chars() const { return this->trim_chars; } 122 | CONSTEXPR VariableColumnPolicy get_variable_column_policy() const { return this->variable_column_policy; } 123 | #endif 124 | 125 | /** CSVFormat for guessing the delimiter */ 126 | CSV_INLINE static CSVFormat guess_csv() { 127 | CSVFormat format; 128 | format.delimiter({ ',', '|', '\t', ';', '^' }) 129 | .quote('"') 130 | .header_row(0); 131 | 132 | return format; 133 | } 134 | 135 | bool guess_delim() { 136 | return this->possible_delimiters.size() > 1; 137 | } 138 | 139 | friend CSVReader; 140 | friend internals::IBasicCSVParser; 141 | 142 | private: 143 | /** Throws an error if the quote, delimiter, and trim characters overlap */ 144 | void assert_no_char_overlap(); 145 | 146 | /** Set of possible delimiters */ 147 | std::vector possible_delimiters = { ',' }; 148 | 149 | /** Set of whitespace characters to trim */ 150 | std::vector trim_chars = {}; 151 | 152 | /** Row number with columns (ignored if col_names is non-empty) */ 153 | int header = 0; 154 | 155 | /** True when quoting is disabled (see is_quoting_enabled()) */ 156 | bool no_quote = false; 157 | 158 | /** Quote character */ 159 | char quote_char = '"'; 160 | 161 | /** Should be left empty unless file doesn't include header */ 162 | std::vector col_names = {}; 163 | 164 | /** Allow variable length columns? 
*/ 165 | VariableColumnPolicy variable_column_policy = VariableColumnPolicy::IGNORE_ROW; 166 | }; 167 | } -------------------------------------------------------------------------------- /tests/test_data_type.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "csv.hpp" 3 | #include 4 | 5 | #include "./shared/float_test_cases.hpp" 6 | 7 | using namespace csv; 8 | using namespace csv::internals; 9 | 10 | TEST_CASE( "Recognize Integers Properly", "[dtype_int]" ) { 11 | std::string a("1"), b(" 2018 "), c(" -69 "); 12 | long double out = 0; 13 | 14 | REQUIRE(data_type(a, &out) == DataType::CSV_INT8); 15 | REQUIRE(out == 1); 16 | 17 | REQUIRE(data_type(b, &out) == DataType::CSV_INT16); 18 | REQUIRE(out == 2018); 19 | 20 | REQUIRE(data_type(c, &out) == DataType::CSV_INT8); 21 | REQUIRE(out == -69); 22 | } 23 | 24 | TEST_CASE( "Recognize Strings Properly", "[dtype_str]" ) { 25 | auto str = GENERATE(as {}, "test", "999.999.9999", "510-123-4567", "510 123", "510 123 4567"); 26 | 27 | SECTION("String Recognition") { 28 | REQUIRE(data_type(str) == DataType::CSV_STRING); 29 | } 30 | } 31 | 32 | TEST_CASE( "Recognize Null Properly", "[dtype_null]" ) { 33 | std::string null_str(""); 34 | REQUIRE( data_type(null_str) == DataType::CSV_NULL ); 35 | } 36 | 37 | TEST_CASE( "Recognize Floats Properly", "[dtype_float]" ) { 38 | using std::make_tuple; 39 | 40 | SECTION("Parse One Float") { 41 | std::string input; 42 | long double out = 0; 43 | long double expected = 0; 44 | 45 | std::tie(input, expected) = 46 | GENERATE(table( 47 | csv_test::FLOAT_TEST_CASES)); 48 | 49 | REQUIRE(data_type(input, &out) == DataType::CSV_DOUBLE); 50 | REQUIRE(is_equal(out, expected)); 51 | } 52 | } 53 | 54 | TEST_CASE("Integer Size Recognition", "[int_sizes]") { 55 | std::string s; 56 | long double out = 0; 57 | 58 | SECTION("Boundary Values") { 59 | s = std::to_string((long long)csv::internals::CSV_INT8_MAX); 60 | REQUIRE(data_type(s, &out) == 
DataType::CSV_INT8); 61 | REQUIRE(out == (long long)CSV_INT8_MAX); 62 | 63 | s = std::to_string((long long)csv::internals::CSV_INT16_MAX); 64 | REQUIRE(data_type(s, &out) == DataType::CSV_INT16); 65 | REQUIRE(out == (long long)CSV_INT16_MAX); 66 | 67 | s = std::to_string((long long)csv::internals::CSV_INT32_MAX); 68 | REQUIRE(data_type(s, &out) == DataType::CSV_INT32); 69 | REQUIRE(out == (long long)CSV_INT32_MAX); 70 | 71 | // Note: data_type() doesn't have enough precision for CSV_INT64 72 | } 73 | 74 | SECTION("Integer Overflow") { 75 | s = std::to_string((long long)csv::internals::CSV_INT16_MAX + 1); 76 | REQUIRE(data_type(s, &out) == DataType::CSV_INT32); 77 | REQUIRE(out == (long long)CSV_INT16_MAX + 1); 78 | 79 | s = std::to_string((long long)csv::internals::CSV_INT32_MAX + 1); 80 | REQUIRE(data_type(s, &out) == DataType::CSV_INT64); 81 | REQUIRE(out == (long long)CSV_INT32_MAX + 1); 82 | 83 | // Case: Integer too large to fit in int64 --> store in long double 84 | s = std::to_string((long long)csv::internals::CSV_INT64_MAX); 85 | s.append("1"); 86 | REQUIRE(data_type(s, &out) == DataType::CSV_BIGINT); 87 | } 88 | } 89 | 90 | TEST_CASE( "Recognize Sub-Unit Double Values", "[regression_double]" ) { 91 | std::string s("0.15"); 92 | long double out = 0; 93 | REQUIRE(data_type(s, &out) == DataType::CSV_DOUBLE); 94 | REQUIRE(is_equal(out, 0.15L)); 95 | } 96 | 97 | TEST_CASE( "Recognize Double Values", "[regression_double2]" ) { 98 | // Test converting double values back and forth 99 | long double out = -1.0; 100 | std::string s; 101 | 102 | for (long double i = 0; i <= 2.0; i += 0.01) { 103 | s = std::to_string(i); 104 | REQUIRE(data_type(s, &out) == DataType::CSV_DOUBLE); 105 | REQUIRE(is_equal(out, i)); 106 | } 107 | } 108 | 109 | //! 
[Parse Scientific Notation] 110 | TEST_CASE("Parse Scientific Notation", "[e_notation]") { 111 | // Test parsing e notation 112 | long double out = 0; 113 | 114 | REQUIRE(data_type("1E-06", &out) == DataType::CSV_DOUBLE); 115 | REQUIRE(is_equal(out, 0.000001L)); 116 | 117 | REQUIRE(data_type("1e-06", &out) == DataType::CSV_DOUBLE); 118 | REQUIRE(is_equal(out, 0.000001L)); 119 | 120 | REQUIRE(data_type("2.17222E+02", &out) == DataType::CSV_DOUBLE); 121 | REQUIRE(is_equal(out, 217.222L)); 122 | 123 | REQUIRE(data_type("4.55E+10", &out) == DataType::CSV_DOUBLE); 124 | REQUIRE(is_equal(out, 45500000000.0L)); 125 | 126 | REQUIRE(data_type("4.55E+11", &out) == DataType::CSV_DOUBLE); 127 | REQUIRE(is_equal(out, 455000000000.0L)); 128 | 129 | REQUIRE(data_type("4.55E-1", &out) == DataType::CSV_DOUBLE); 130 | REQUIRE(is_equal(out, 0.455L)); 131 | 132 | REQUIRE(data_type("4.55E-5", &out) == DataType::CSV_DOUBLE); 133 | REQUIRE(is_equal(out, 0.0000455L)); 134 | 135 | REQUIRE(data_type("4.55E-000000000005", &out) == DataType::CSV_DOUBLE); 136 | REQUIRE(is_equal(out, 0.0000455L)); 137 | } 138 | //! [Parse Scientific Notation] 139 | 140 | //! [Scientific Notation Flavors] 141 | TEST_CASE("Parse Different Flavors of Scientific Notation", "[sci_notation_diversity]") { 142 | auto number = GENERATE(as {}, 143 | "4.55e5", "4.55E5", 144 | "4.55E+5", "4.55e+5", 145 | "4.55E+05", 146 | "4.55e0000005", "4.55E0000005", 147 | "4.55e+0000005", "4.55E+0000005"); 148 | 149 | SECTION("Recognize 455 thousand") { 150 | long double out = 0; 151 | REQUIRE(data_type(number, &out) == DataType::CSV_DOUBLE); 152 | REQUIRE(is_equal(out, 455000.0L)); 153 | } 154 | } 155 | //! 
[Scientific Notation Flavors] 156 | 157 | TEST_CASE("Parse Scientific Notation Malformed", "[sci_notation]") { 158 | // Assert parsing butchered scientific notation won't cause a 159 | // crash or any other weird side effects 160 | auto butchered = GENERATE(as{}, 161 | "4.55E000a", 162 | "4.55000x40", 163 | "4.55000E40E40"); 164 | 165 | SECTION("Butchered Parsing Attempt") { 166 | REQUIRE(data_type(butchered) == DataType::CSV_STRING); 167 | } 168 | } 169 | 170 | TEST_CASE( "Parse numbers with dash as string", "[regression_double]" ) { 171 | std::string s("510-123-4567"); 172 | long double out = 0; 173 | REQUIRE(data_type(s, &out) == DataType::CSV_STRING); 174 | } 175 | -------------------------------------------------------------------------------- /tests/test_read_csv_file.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Tests for CSV parsing 3 | */ 4 | 5 | #include // remove() 6 | #include 7 | #include 8 | #include "csv.hpp" 9 | 10 | using namespace csv; 11 | using std::vector; 12 | using std::string; 13 | 14 | TEST_CASE("col_pos() Test", "[test_col_pos]") { 15 | int pos = get_col_pos( 16 | "./tests/data/real_data/2015_StateDepartment.csv", 17 | "Entity Type"); 18 | REQUIRE(pos == 1); 19 | } 20 | 21 | TEST_CASE("Prevent Column Names From Being Overwritten", "[csv_col_names_overwrite]") { 22 | std::vector column_names = { "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10" }; 23 | 24 | // Test against a variety of different CSVFormat objects 25 | std::vector formats = {}; 26 | formats.push_back(CSVFormat::guess_csv()); 27 | formats.push_back(CSVFormat()); 28 | formats.back().delimiter(std::vector({ ',', '\t', '|'})); 29 | formats.push_back(CSVFormat()); 30 | formats.back().delimiter(std::vector({ ',', '~'})); 31 | 32 | for (auto& format_in : formats) { 33 | // Set up the CSVReader 34 | format_in.column_names(column_names); 35 | CSVReader reader("./tests/data/fake_data/ints_comments.csv", format_in); 
36 | 37 | // Assert that column names weren't overwritten 38 | CSVFormat format_out = reader.get_format(); 39 | REQUIRE(reader.get_col_names() == column_names); 40 | REQUIRE(format_out.get_delim() == ','); 41 | REQUIRE(format_out.get_header() == 5); 42 | } 43 | } 44 | 45 | // get_file_info() 46 | TEST_CASE("get_file_info() Test", "[test_file_info]") { 47 | SECTION("ints.csv") { 48 | CSVFileInfo info = get_file_info( 49 | "./tests/data/fake_data/ints.csv"); 50 | 51 | REQUIRE(info.delim == ','); 52 | REQUIRE(info.n_rows == 100); 53 | } 54 | 55 | SECTION("2009PowerStatus.txt") { 56 | CSVFileInfo info = get_file_info( 57 | "./tests/data/real_data/2009PowerStatus.txt"); 58 | 59 | REQUIRE(info.delim == '|'); 60 | REQUIRE(info.n_rows == 37960); // Can confirm with Excel 61 | REQUIRE(info.n_cols == 3); 62 | REQUIRE(info.col_names == vector({ "ReportDt", "Unit", "Power" })); 63 | } 64 | } 65 | 66 | TEST_CASE("Non-Existent CSV", "[read_ghost_csv]") { 67 | // Make sure attempting to parse a non-existent CSV throws an error 68 | bool error_caught = false; 69 | 70 | try { 71 | CSVReader reader("./lochness.csv"); 72 | } 73 | catch (std::runtime_error& err) { 74 | error_caught = true; 75 | REQUIRE(err.what() == std::string("Cannot open file ./lochness.csv")); 76 | } 77 | 78 | REQUIRE(error_caught); 79 | } 80 | 81 | TEST_CASE("Test Read CSV where file does NOT end with newline", "[test_file_info_ints2]") { 82 | CSVReader reader("./tests/data/fake_data/ints_doesnt_end_in_newline.csv"); 83 | 84 | auto row = reader.begin(); 85 | for (; row != reader.end(); row++) {} // skip to end 86 | 87 | REQUIRE((*row)["A"] == 100); 88 | REQUIRE((*row)["J"] == 100); 89 | } 90 | 91 | TEST_CASE( "Test Read CSV with Header Row", "[read_csv_header]" ) { 92 | // Header on first row 93 | constexpr auto path = "./tests/data/real_data/2015_StateDepartment.csv"; 94 | 95 | // Test using memory mapped IO and std::ifstream 96 | std::vector readers = {}; 97 | std::ifstream infile(path, std::ios::binary); 98 | 
99 | readers.emplace_back(path, CSVFormat()); // Memory mapped 100 | readers.emplace_back(infile, CSVFormat()); 101 | 102 | for (auto& reader : readers) { 103 | CSVRow row; 104 | reader.read_row(row); // Populate row with first line 105 | 106 | // Expected Results 107 | vector col_names = { 108 | "Year", "Entity Type", "Entity Group", "Entity Name", 109 | "Department / Subdivision", "Position", "Elected Official", 110 | "Judicial", "Other Positions", "Min Classification Salary", 111 | "Max Classification Salary", "Reported Base Wage", "Regular Pay", 112 | "Overtime Pay", "Lump-Sum Pay", "Other Pay", "Total Wages", 113 | "Defined Benefit Plan Contribution", "Employees Retirement Cost Covered", 114 | "Deferred Compensation Plan", "Health Dental Vision", 115 | "Total Retirement and Health Cost", "Pension Formula", 116 | "Entity URL", "Entity Population", "Last Updated", 117 | "Entity County", "Special District Activities" 118 | }; 119 | 120 | vector first_row = { 121 | "2015","State Department","","Administrative Law, Office of","", 122 | "Assistant Chief Counsel","False","False","","112044","129780","" 123 | ,"133020.06","0","2551.59","2434.8","138006.45","34128.65","0","0" 124 | ,"15273.97","49402.62","2.00% @ 55","http://www.spb.ca.gov/","", 125 | "08/02/2016","","" 126 | }; 127 | 128 | REQUIRE(vector(row) == first_row); 129 | REQUIRE(reader.get_col_names() == col_names); 130 | 131 | // Skip to end 132 | while (reader.read_row(row)); 133 | REQUIRE(reader.n_rows() == 246497); 134 | } 135 | } 136 | 137 | // 138 | // read_row() 139 | // 140 | //! [CSVField Example] 141 | TEST_CASE("Test read_row() CSVField - Easy", "[read_row_csvf1]") { 142 | // Test that integers are type-casted properly 143 | CSVReader reader("./tests/data/fake_data/ints.csv"); 144 | CSVRow row; 145 | 146 | while (reader.read_row(row)) { 147 | for (size_t i = 0; i < row.size(); i++) { 148 | REQUIRE(row[i].is_int()); 149 | REQUIRE(row[i].get() <= 100); 150 | } 151 | } 152 | } 153 | //! 
[CSVField Example] 154 | 155 | TEST_CASE("Test read_row() CSVField - Power Status", "[read_row_csvf3]") { 156 | CSVReader reader("./tests/data/real_data/2009PowerStatus.txt"); 157 | CSVRow row; 158 | 159 | size_t date = reader.index_of("ReportDt"), 160 | unit = reader.index_of("Unit"), 161 | power = reader.index_of("Power"); 162 | 163 | // Try to find a non-existent column 164 | REQUIRE(reader.index_of("metallica") == CSV_NOT_FOUND); 165 | 166 | for (size_t i = 0; reader.read_row(row); i++) { 167 | // Assert correct types 168 | REQUIRE(row[date].is_str()); 169 | REQUIRE(row[unit].is_str()); 170 | REQUIRE(row[power].is_int()); 171 | 172 | // Spot check 173 | if (i == 2) { 174 | REQUIRE(row[power].get() == 100); 175 | REQUIRE(row[date].get<>() == "12/31/2009"); // string_view 176 | REQUIRE(row[unit].get() == "Beaver Valley 1"); 177 | } 178 | } 179 | } -------------------------------------------------------------------------------- /tests/test_csv_field.cpp: -------------------------------------------------------------------------------- 1 | #include "csv.hpp" 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace csv; 7 | 8 | #include "./shared/float_test_cases.hpp" 9 | 10 | TEMPLATE_TEST_CASE("CSVField get<> - String Value", "[test_csv_field_get_string]", 11 | signed char, short int, int, long long int, double, long double) { 12 | CSVField field("applesauce"); 13 | REQUIRE(field.get<>() == "applesauce"); 14 | 15 | // Assert that improper conversions attempts are thwarted 16 | bool ex_caught = false; 17 | try { 18 | field.get(); 19 | } 20 | catch (std::runtime_error& err) { 21 | REQUIRE(err.what() == csv::internals::ERROR_NAN); 22 | ex_caught = true; 23 | } 24 | 25 | REQUIRE(ex_caught); 26 | } 27 | 28 | TEST_CASE("CSVField get<> - Error Messages", "[test_csv_field_get_error]") { 29 | CSVField field("applesauce"); 30 | 31 | bool ex_caught = false; 32 | try { 33 | field.get(); 34 | } 35 | catch (std::runtime_error& err) { 36 | REQUIRE(err.what() == 
csv::internals::ERROR_NAN); 37 | ex_caught = true; 38 | } 39 | 40 | REQUIRE(ex_caught); 41 | } 42 | 43 | TEST_CASE("CSVField get<>() - Integral Value", "[test_csv_field_get_int]") { 44 | CSVField this_year("2019"); 45 | REQUIRE(this_year.get<>() == "2019"); 46 | REQUIRE(this_year.get() == "2019"); 47 | REQUIRE(this_year.get() == 2019); 48 | REQUIRE(this_year.get() == 2019); 49 | REQUIRE(this_year.get() == 2019.0f); 50 | REQUIRE(this_year.get() == 2019.0); 51 | REQUIRE(this_year.get() == 2019l); 52 | 53 | bool ex_caught = false; 54 | try { 55 | this_year.get(); 56 | } 57 | catch (std::runtime_error& err) { 58 | REQUIRE(err.what() == csv::internals::ERROR_OVERFLOW); 59 | ex_caught = true; 60 | } 61 | 62 | REQUIRE(ex_caught); 63 | } 64 | 65 | TEST_CASE("CSVField get<>() - Integer Boundary Value", "[test_csv_field_get_boundary]") { 66 | // Note: Tests may fail if compiler defines typenames differently than 67 | // Microsoft/GCC/clang 68 | REQUIRE(CSVField("127").get() == 127); 69 | REQUIRE(CSVField("32767").get() == 32767); 70 | REQUIRE(CSVField("2147483647").get() == 2147483647); 71 | 72 | REQUIRE(CSVField("255").get() == 255); 73 | REQUIRE(CSVField("65535").get() == 65535); 74 | REQUIRE(CSVField("4294967295").get() == 4294967295); 75 | } 76 | 77 | // Test converting a small integer to unsigned and signed integer types 78 | TEMPLATE_TEST_CASE("CSVField get<>() - Integral Value to Int", "[test_csv_field_convert_int]", 79 | unsigned char, unsigned short, unsigned int, unsigned long long, 80 | char, short, int, long long int) { 81 | CSVField savage("21"); 82 | REQUIRE(savage.get() == 21); 83 | } 84 | 85 | TEST_CASE("CSVField get<>() - Floating Point Value", "[test_csv_field_get_float]") { 86 | SECTION("Test get() with various float types") { 87 | CSVField euler("2.718"); 88 | REQUIRE(euler.get<>() == "2.718"); 89 | REQUIRE(euler.get() == "2.718"); 90 | REQUIRE(euler.get() == 2.718f); 91 | REQUIRE(euler.get() == 2.718); 92 | REQUIRE(euler.get() == 2.718l); 93 | } 94 | 95 
| SECTION("Test get() with various values") { 96 | std::string input; 97 | long double expected = 0; 98 | 99 | std::tie(input, expected) = 100 | GENERATE(table( 101 | csv_test::FLOAT_TEST_CASES)); 102 | 103 | CSVField testField(input); 104 | 105 | REQUIRE(internals::is_equal(testField.get(), expected)); 106 | } 107 | } 108 | 109 | TEST_CASE("CSVField try_parse_hex()", "[test_csv_field_parse_hex]") { 110 | int value = 0; 111 | 112 | SECTION("Valid Hex Values") { 113 | std::unordered_map test_cases = { 114 | {" A ", 10}, 115 | {"0A", 10}, 116 | {"0B", 11}, 117 | {"0C", 12}, 118 | {"0D", 13}, 119 | {"0E", 14}, 120 | {"0F", 15}, 121 | {"FF", 255}, 122 | {"B00B5", 721077}, 123 | {"D3ADB33F", 3551376191}, 124 | {" D3ADB33F ", 3551376191} 125 | }; 126 | 127 | for (auto& _case : test_cases) { 128 | REQUIRE(CSVField(_case.first).try_parse_hex(value)); 129 | REQUIRE(value == _case.second); 130 | } 131 | } 132 | 133 | SECTION("Invalid Values") { 134 | std::vector invalid_test_cases = { 135 | "", " ", "carneasda", "carne asada", "0fg" 136 | }; 137 | 138 | for (auto& _case : invalid_test_cases) { 139 | REQUIRE(CSVField(_case).try_parse_hex(value) == false); 140 | } 141 | } 142 | } 143 | 144 | 145 | TEST_CASE("CSVField try_parse_decimal()", "[test_csv_field_parse_hex]") { 146 | SECTION("Test try_parse_decimal() with non-numeric value") { 147 | long double output = 0; 148 | std::string input = "stroustrup"; 149 | CSVField testField(input); 150 | 151 | REQUIRE(testField.try_parse_decimal(output, ',') == false); 152 | REQUIRE(testField.type() == DataType::CSV_STRING); 153 | } 154 | 155 | SECTION("Test try_parse_decimal() with integer value") { 156 | long double output = 0; 157 | std::string input = "2024"; 158 | CSVField testField(input); 159 | 160 | REQUIRE(testField.try_parse_decimal(output, ',') == true); 161 | REQUIRE(testField.type() == DataType::CSV_INT16); 162 | REQUIRE(internals::is_equal(output, 2024.0l)); 163 | } 164 | 165 | SECTION("Test try_parse_decimal() with various 
valid values") { 166 | std::string input; 167 | long double output = 0; 168 | long double expected = 0; 169 | 170 | std::tie(input, expected) = 171 | GENERATE(table( 172 | csv_test::FLOAT_TEST_CASES)); 173 | 174 | // Replace '.' with ',' 175 | std::replace(input.begin(), input.end(), '.', ','); 176 | 177 | CSVField testField(input); 178 | 179 | REQUIRE(testField.try_parse_decimal(output, ',') == true); 180 | REQUIRE(testField.type() == DataType::CSV_DOUBLE); 181 | REQUIRE(internals::is_equal(output, expected)); 182 | } 183 | } 184 | 185 | TEMPLATE_TEST_CASE("CSVField get<>() - Disallow Float to Int", "[test_csv_field_get_float_as_int]", 186 | unsigned char, unsigned short, unsigned int, unsigned long long int, 187 | signed char, short, int, long long int) { 188 | CSVField euler("2.718"); 189 | bool ex_caught = false; 190 | 191 | try { 192 | euler.get(); 193 | } 194 | catch (std::runtime_error& err) { 195 | REQUIRE(err.what() == csv::internals::ERROR_FLOAT_TO_INT); 196 | ex_caught = true; 197 | } 198 | 199 | REQUIRE(ex_caught); 200 | } 201 | 202 | TEMPLATE_TEST_CASE("CSVField get<>() - Disallow Negative to Unsigned", "[test_csv_field_no_unsigned_neg]", 203 | unsigned char, unsigned short, unsigned int, unsigned long long int) { 204 | CSVField neg("-1337"); 205 | bool ex_caught = false; 206 | 207 | try { 208 | neg.get(); 209 | } 210 | catch (std::runtime_error& err) { 211 | REQUIRE(err.what() == csv::internals::ERROR_NEG_TO_UNSIGNED); 212 | ex_caught = true; 213 | } 214 | 215 | REQUIRE(ex_caught); 216 | } 217 | 218 | TEST_CASE("CSVField Equality Operator", "[test_csv_field_operator==]") { 219 | CSVField field("3.14"); 220 | REQUIRE(field == "3.14"); 221 | REQUIRE(field == 3.14f); 222 | REQUIRE(field == 3.14); 223 | } -------------------------------------------------------------------------------- /tests/test_raw_csv_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "internal/basic_csv_parser.hpp" 3 | 
/** Builds the CSV documents used by the empty-field whitespace-trimming test.
 *
 *  Every returned string is an independent document of four rows with three
 *  fields each; the documents differ only in the whitespace around fields.
 *
 *  @return One CSV document per whitespace flavour, in declaration order
 */
inline std::vector<std::string> make_whitespace_test_cases() {
    std::vector<std::string> test_cases;
    std::stringstream ss;

    // Stores the document built so far and empties the stream for the next one.
    // str("") discards the buffered text; clear() alone only resets the
    // stream's error flags and would leave earlier documents in the buffer.
    const auto commit = [&test_cases, &ss]() {
        test_cases.push_back(ss.str());
        ss.str(std::string());
        ss.clear();
    };

    // NOTE(review): runs of spaces inside these literals may have been
    // collapsed in the copy of the file this was written from -- compare
    // against the repository before merging.
    ss << "1, two,3" << '\n'
        << "4, ,5" << '\n'
        << " ,6, " << '\n'
        << "7,8,9 " << '\n';
    commit();

    // Lots of Whitespace
    ss << "1, two,3" << '\n'
        << "4, ,5" << '\n'
        << " ,6, " << '\n'
        << "7,8,9 " << '\n';
    commit();

    // Same as above but there's whitespace around 6
    ss << "1, two,3" << '\n'
        << "4, ,5" << '\n'
        << " , 6 , " << '\n'
        << "7,8,9 " << '\n';
    commit();

    // Tabs
    ss << "1, two,3" << '\n'
        << "4, \t ,5" << '\n'
        << "\t\t\t\t\t ,6, \t " << '\n'
        << "7,8,9 " << '\n';
    commit();

    return test_cases;
}
159 | "\t\t 123,\"234\n,345\", 456\r\n", 160 | 161 | // Spaces in quote escaped field 162 | "A,B,C\r\n" 163 | "\t\t 123,\" 234\n,345 \t\", 456\r\n", 164 | 165 | // Spaces in one header column 166 | "A,B, C\r\n" 167 | "123,\"234\n,345\",456\r\n", 168 | 169 | // Random spaces + tabs in header 170 | "\t A, B\t, C\r\n" 171 | "123,\"234\n,345\",456\r\n", 172 | 173 | // Random spaces in header + data 174 | "A,B, C\r\n" 175 | "123,\"234\n,345\", 456\r\n" 176 | ); 177 | 178 | SECTION("Parse Test") { 179 | using namespace std; 180 | 181 | RowCollectionTest rows; 182 | 183 | auto csv = std::stringstream(row_str); 184 | StreamParser parser( 185 | csv, 186 | internals::make_parse_flags(',', '"'), 187 | internals::make_ws_flags({ ' ', '\t' }) 188 | ); 189 | 190 | parser.set_output(rows); 191 | parser.next(); 192 | 193 | auto header = rows[0]; 194 | REQUIRE(vector(header) == vector( 195 | { "A", "B", "C" })); 196 | 197 | auto row = rows[1]; 198 | REQUIRE(vector(row) == 199 | vector({ "123", "234\n,345", "456" })); 200 | REQUIRE(row[0] == "123"); 201 | REQUIRE(row[1] == "234\n,345"); 202 | REQUIRE(row[2] == "456"); 203 | } 204 | } 205 | 206 | TEST_CASE("Test Parser Whitespace Trimming w/ Empty Fields", "[test_raw_ws_trim]") { 207 | auto csv_string = GENERATE(from_range(make_whitespace_test_cases())); 208 | 209 | SECTION("Parse Test") { 210 | RowCollectionTest rows; 211 | 212 | auto csv = std::stringstream(csv_string); 213 | StreamParser parser( 214 | csv, 215 | internals::make_parse_flags(',', '"'), 216 | internals::make_ws_flags({ ' ', '\t' }) 217 | ); 218 | 219 | parser.set_output(rows); 220 | 221 | parser.next(); 222 | 223 | size_t row_no = 0; 224 | for (auto& row : rows) { 225 | switch (row_no) { 226 | case 0: 227 | REQUIRE(row[0].get() == 1); 228 | REQUIRE(row[1].get() == "two"); 229 | REQUIRE(row[2].get() == 3); 230 | break; 231 | 232 | case 1: 233 | REQUIRE(row[0].get() == 4); 234 | REQUIRE(row[1].is_null()); 235 | REQUIRE(row[2].get() == 5); 236 | break; 237 | 238 | case 
2: 239 | REQUIRE(row[0].is_null()); 240 | REQUIRE(row[1].get() == 6); 241 | REQUIRE(row[2].is_null()); 242 | break; 243 | 244 | case 3: 245 | REQUIRE(row[0].get() == 7); 246 | REQUIRE(row[1].get() == 8); 247 | REQUIRE(row[2].get() == 9); 248 | break; 249 | } 250 | 251 | row_no++; 252 | } 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /include/internal/common.hpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * A standalone header file containing shared code 3 | */ 4 | 5 | #pragma once 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #if defined(_WIN32) 13 | # ifndef WIN32_LEAN_AND_MEAN 14 | # define WIN32_LEAN_AND_MEAN 15 | # endif 16 | # include 17 | # undef max 18 | # undef min 19 | #elif defined(__linux__) 20 | # include 21 | #endif 22 | 23 | /** Helper macro which should be #defined as "inline" 24 | * in the single header version 25 | */ 26 | #define CSV_INLINE 27 | 28 | #pragma once 29 | #include 30 | 31 | #include "../external/string_view.hpp" 32 | 33 | // If there is another version of Hedley, then the newer one 34 | // takes precedence. 35 | // See: https://github.com/nemequ/hedley 36 | #include "../external/hedley.h" 37 | 38 | namespace csv { 39 | #ifdef _MSC_VER 40 | #pragma region Compatibility Macros 41 | #endif 42 | /** 43 | * @def IF_CONSTEXPR 44 | * Expands to `if constexpr` in C++17 and `if` otherwise 45 | * 46 | * @def CONSTEXPR_VALUE 47 | * Expands to `constexpr` in C++17 and `const` otherwise. 48 | * Mainly used for global variables. 49 | * 50 | * @def CONSTEXPR 51 | * Expands to `constexpr` in decent compilers and `inline` otherwise. 52 | * Intended for functions and methods. 
53 | */ 54 | 55 | #define STATIC_ASSERT(x) static_assert(x, "Assertion failed") 56 | 57 | #if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD == 20) || __cplusplus >= 202002L 58 | #define CSV_HAS_CXX20 59 | #endif 60 | 61 | #if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD == 17) || __cplusplus >= 201703L 62 | #define CSV_HAS_CXX17 63 | #endif 64 | 65 | #if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD >= 14) || __cplusplus >= 201402L 66 | #define CSV_HAS_CXX14 67 | #endif 68 | 69 | #ifdef CSV_HAS_CXX17 70 | #include 71 | /** @typedef string_view 72 | * The string_view class used by this library. 73 | */ 74 | using string_view = std::string_view; 75 | #else 76 | /** @typedef string_view 77 | * The string_view class used by this library. 78 | */ 79 | using string_view = nonstd::string_view; 80 | #endif 81 | 82 | #ifdef CSV_HAS_CXX17 83 | #define IF_CONSTEXPR if constexpr 84 | #define CONSTEXPR_VALUE constexpr 85 | 86 | #define CONSTEXPR_17 constexpr 87 | #else 88 | #define IF_CONSTEXPR if 89 | #define CONSTEXPR_VALUE const 90 | 91 | #define CONSTEXPR_17 inline 92 | #endif 93 | 94 | #ifdef CSV_HAS_CXX14 95 | template 96 | using enable_if_t = std::enable_if_t; 97 | 98 | #define CONSTEXPR_14 constexpr 99 | #define CONSTEXPR_VALUE_14 constexpr 100 | #else 101 | template 102 | using enable_if_t = typename std::enable_if::type; 103 | 104 | #define CONSTEXPR_14 inline 105 | #define CONSTEXPR_VALUE_14 const 106 | #endif 107 | 108 | // Resolves g++ bug with regard to constexpr methods 109 | // See: https://stackoverflow.com/questions/36489369/constexpr-non-static-member-function-with-non-constexpr-constructor-gcc-clang-d 110 | #if defined __GNUC__ && !defined __clang__ 111 | #if (__GNUC__ >= 7 &&__GNUC_MINOR__ >= 2) || (__GNUC__ >= 8) 112 | #define CONSTEXPR constexpr 113 | #endif 114 | #else 115 | #ifdef CSV_HAS_CXX17 116 | #define CONSTEXPR constexpr 117 | #endif 118 | #endif 119 | 120 | #ifndef CONSTEXPR 121 | #define CONSTEXPR inline 122 | #endif 123 | 124 | 
#ifdef _MSC_VER 125 | #pragma endregion 126 | #endif 127 | 128 | namespace internals { 129 | // PAGE_SIZE macro could be already defined by the host system. 130 | #if defined(PAGE_SIZE) 131 | #undef PAGE_SIZE 132 | #endif 133 | 134 | // Get operating system specific details 135 | #if defined(_WIN32) 136 | inline int getpagesize() { 137 | _SYSTEM_INFO sys_info = {}; 138 | GetSystemInfo(&sys_info); 139 | return std::max(sys_info.dwPageSize, sys_info.dwAllocationGranularity); 140 | } 141 | 142 | const int PAGE_SIZE = getpagesize(); 143 | #elif defined(__linux__) 144 | const int PAGE_SIZE = getpagesize(); 145 | #else 146 | /** Size of a memory page in bytes. Used by 147 | * csv::internals::CSVFieldArray when allocating blocks. 148 | */ 149 | const int PAGE_SIZE = 4096; 150 | #endif 151 | 152 | /** For functions that lazy load a large CSV, this determines how 153 | * many bytes are read at a time 154 | */ 155 | constexpr size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB 156 | 157 | template 158 | inline bool is_equal(T a, T b, T epsilon = 0.001) { 159 | /** Returns true if two floating point values are about the same */ 160 | static_assert(std::is_floating_point::value, "T must be a floating point type."); 161 | return std::abs(a - b) < epsilon; 162 | } 163 | 164 | /** @typedef ParseFlags 165 | * An enum used for describing the significance of each character 166 | * with respect to CSV parsing 167 | * 168 | * @see quote_escape_flag 169 | */ 170 | enum class ParseFlags { 171 | QUOTE_ESCAPE_QUOTE = 0, /**< A quote inside or terminating a quote_escaped field */ 172 | QUOTE = 2 | 1, /**< Characters which may signify a quote escape */ 173 | NOT_SPECIAL = 4, /**< Characters with no special meaning or escaped delimiters and newlines */ 174 | DELIMITER = 4 | 2, /**< Characters which signify a new field */ 175 | NEWLINE = 4 | 2 | 1 /**< Characters which signify a new row */ 176 | }; 177 | 178 | /** Transform the ParseFlags given the context of whether or not the current 179 | * 
field is quote escaped */ 180 | constexpr ParseFlags quote_escape_flag(ParseFlags flag, bool quote_escape) noexcept { 181 | return (ParseFlags)((int)flag & ~((int)ParseFlags::QUOTE * quote_escape)); 182 | } 183 | 184 | // Assumed to be true by parsing functions: allows for testing 185 | // if an item is DELIMITER or NEWLINE with a >= statement 186 | STATIC_ASSERT(ParseFlags::DELIMITER < ParseFlags::NEWLINE); 187 | 188 | /** Optimizations for reducing branching in parsing loop 189 | * 190 | * Idea: The meaning of all non-quote characters changes depending 191 | * on whether or not the parser is in a quote-escaped mode (0 or 1) 192 | */ 193 | STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, false) == ParseFlags::NOT_SPECIAL); 194 | STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, false) == ParseFlags::QUOTE); 195 | STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, false) == ParseFlags::DELIMITER); 196 | STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, false) == ParseFlags::NEWLINE); 197 | 198 | STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, true) == ParseFlags::NOT_SPECIAL); 199 | STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, true) == ParseFlags::QUOTE_ESCAPE_QUOTE); 200 | STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, true) == ParseFlags::NOT_SPECIAL); 201 | STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, true) == ParseFlags::NOT_SPECIAL); 202 | 203 | /** An array which maps ASCII chars to a parsing flag */ 204 | using ParseFlagMap = std::array; 205 | 206 | /** An array which maps ASCII chars to a flag indicating if it is whitespace */ 207 | using WhitespaceMap = std::array; 208 | } 209 | 210 | /** Integer indicating a requested column wasn't found. */ 211 | constexpr int CSV_NOT_FOUND = -1; 212 | 213 | /** Offset to convert char into array index. */ 214 | constexpr unsigned CHAR_OFFSET = std::numeric_limits::is_signed ? 
128 : 0; 215 | } 216 | -------------------------------------------------------------------------------- /single_header.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | import re 4 | 5 | CPP_SEP = '/' 6 | Include = namedtuple('Include', ['path', 'line_no']) 7 | 8 | ''' Represents a file path ''' 9 | class Path(list): 10 | def __init__(self, *args): 11 | super().__init__() 12 | 13 | if (len(args) > 0 and type(args[0]) is list): 14 | for p in args[0]: 15 | self.append(p) 16 | else: 17 | for p in args: 18 | self.append(p) 19 | 20 | def append(self, sub: str): 21 | separated = sub.split(os.sep) 22 | if (len(separated) == 1): 23 | separated = sub.split(CPP_SEP) 24 | 25 | for i in separated: 26 | if (i == '..'): 27 | # Go up a path 28 | self.pop() 29 | else: 30 | super().append(i) 31 | 32 | def copy(self): 33 | temp = Path() 34 | for i in self: 35 | temp.append(i) 36 | return temp 37 | 38 | def join(self, sub: str): 39 | temp = self.copy() 40 | temp.append(sub) 41 | return temp 42 | 43 | ''' Return the first element of the path ''' 44 | def dirname(self) -> str: 45 | try: 46 | return self[0] 47 | except IndexError: 48 | return '' 49 | 50 | def ext(self) -> str: 51 | try: 52 | return self[-1].split('.')[-1] 53 | except IndexError: 54 | return '' 55 | 56 | def __str__(self): 57 | if (len(self) == 1): 58 | return self[0] + '/' 59 | 60 | return '/'.join(self) 61 | 62 | def __hash__(self): 63 | return hash(str(self)) 64 | 65 | def header_list(files: list) -> list: 66 | ''' 67 | Given a list of files, compute the list of header files in the order in which they should 68 | be included to avoid conflicts 69 | ''' 70 | 71 | dependencies = {} 72 | headers = [] 73 | 74 | ''' Iterate over every .cpp and .hpp file ''' 75 | for file in files: 76 | file_ext = file.ext() 77 | if (file_ext == 'hpp' or file_ext == 'h'): 78 | dependencies[file] = [d.path for d in get_dependencies(file)['local']] 79 
| 80 | while dependencies: 81 | for file in list(dependencies.keys()): 82 | # Remove includes we've already included 83 | dependencies[file] = [i for i in dependencies[file] if i not in headers] 84 | 85 | # If no more dependencies, add file 86 | if not dependencies[file]: 87 | headers.append(file) 88 | dependencies.pop(file) 89 | 90 | return headers 91 | 92 | def get_dependencies(file: Path) -> dict: 93 | ''' Parse a .cpp/.hpp file for its system and local dependencies ''' 94 | 95 | dir = Path(file[:-1]) 96 | 97 | headers = { 98 | "system": [], 99 | "local": [] 100 | } 101 | 102 | with open(str(file), mode='r') as infile: 103 | for i, line in enumerate(infile): 104 | sys_include = re.search('^#include <(?P.*)>', line) 105 | local_include = re.search('^#include "(?P.*)"', line) 106 | if sys_include: 107 | headers["system"].append( 108 | Include(path=sys_include.group('file'), line_no=i)) 109 | elif local_include: 110 | headers["local"].append( 111 | Include(path=dir.join(local_include.group('file')), line_no=i)) 112 | 113 | return headers 114 | 115 | ''' Strip local include statements and #pragma once declarations from source files ''' 116 | def file_strip(file: Path) -> str: 117 | new_file = '' 118 | strip_these = ['#include "(?P.*)"', '#pragma once' ] 119 | 120 | # Strip out pragma once 121 | with open(str(file), mode='r') as infile: 122 | for line in infile: 123 | add_this_line = sum(re.search(strip, line) is not None for strip in strip_these) == 0 124 | 125 | # Change "#define CSV_INLINE" to "#define CSV_INLINE inline" 126 | if ('#define CSV_INLINE' in line): 127 | line = "#define CSV_INLINE inline\n" 128 | 129 | if (add_this_line): 130 | new_file += line 131 | 132 | return new_file 133 | 134 | ''' 135 | Collate header files by using this following algorithm: 136 | 137 | - Given a list of header files (HEADERS) ordered such that the first file 138 | has no internal dependencies, and the last file is the most dependent 139 | - Reverse the list 140 | - Maintain 
these data structures: 141 | - A set of header files (PROCESSED) that were processed 142 | - A set of header files (MISSING_INCLUDES) that we are looking for 143 | - The collation of header source code (HEADER_CONCAT) 144 | - Go through each FILE in list of headers in reverse order (starting with 145 | the headers at the highest level of the dependency tree) 146 | - If FILE is not in MISSING_INCLUDES, then concatenate source verbatim to HEADER_CONCAT 147 | - Otherwise, there is one or more #include statements in HEADER_CONCAT which references FILE 148 | - Replace the first #include statement with the source of FILE, and remove the rest 149 | ''' 150 | def header_collate(headers: list): 151 | headers.reverse() 152 | 153 | # Placeholder for includes to be inserted 154 | splice_template = "__INSERT_HEADER_HERE__({})\n" 155 | header_concat = '' 156 | processed = set() 157 | missing_includes = set() 158 | 159 | def process_file(path: Path): 160 | source = '' 161 | 162 | with open(str(path), mode='r') as infile: 163 | for line in infile: 164 | # Add local includes to MISSING_INCLUDES 165 | local_include = re.search('^#include "(?P.*)"', line) 166 | if local_include: 167 | dir = Path(path[:-1]) 168 | include_path = dir.join(local_include.group('file')) 169 | 170 | if str(include_path) not in processed: 171 | missing_includes.add(str(include_path)) 172 | source += splice_template.format(str(include_path)) 173 | elif '#pragma once' in line: 174 | continue 175 | else: 176 | source += line 177 | 178 | return source 179 | 180 | for path in headers: 181 | processed.add(str(path)) 182 | 183 | if str(path) in missing_includes: 184 | source = process_file(path) 185 | splice_phrase = splice_template.format(str(path)) 186 | header_concat = header_concat.replace( 187 | splice_phrase, 188 | source + '\n', 1) 189 | header_concat = header_concat.replace(splice_phrase, '') 190 | 191 | missing_includes.remove(str(path)) 192 | else: 193 | header_concat += process_file(path) 194 | 195 | 
return header_concat 196 | 197 | if __name__ == "__main__": 198 | ''' Iterate over every .cpp and .hpp file ''' 199 | headers = [] 200 | sources = [] 201 | system_includes = set() 202 | 203 | # Generate a list of header and source file locations 204 | for dir in os.walk('include'): 205 | files = dir[2] 206 | 207 | for file in files: 208 | fname = Path(dir[0], file) 209 | 210 | if (file[-4:] == '.hpp' or file[-2:] == '.h'): 211 | headers.append(fname) 212 | elif (file[-4:] == '.cpp'): 213 | sources.append(fname) 214 | 215 | # Rearrange header order to avoid compilation conflicts 216 | headers = header_list(sorted(headers)) 217 | 218 | # Get system includes 219 | for file in sources + headers: 220 | for include in get_dependencies(file)['system']: 221 | system_includes.add(include.path) 222 | 223 | # Collate header and source files 224 | header_concat = header_collate(headers) 225 | source_collate = '' 226 | 227 | for cpp in sources: 228 | source_collate += file_strip(cpp) + '\n' 229 | 230 | # Generate hpp file 231 | print("#pragma once") 232 | print(header_concat.replace( 233 | "#define CSV_INLINE", "#define CSV_INLINE inline").replace( 234 | "/** INSERT_CSV_SOURCES **/", source_collate)) -------------------------------------------------------------------------------- /python/csvpy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "csv.hpp" 8 | namespace py = pybind11; 9 | using namespace pybind11::literals; 10 | using namespace csv; 11 | 12 | void init_CSVFormat(py::module& m){ 13 | py::class_(m, "Format") 14 | .def(py::init<>()) 15 | .def("delimiter", 16 | py::overload_cast&>(&CSVFormat::delimiter), 17 | "Sets a list of potential delimiters.", 18 | py::arg("delim")) 19 | .def("delimiter", 20 | py::overload_cast(&CSVFormat::delimiter), 21 | "Sets the delimiter of the CSV file.", 22 | py::arg("delim")) 23 | 24 | .def("trim", 25 | 
&CSVFormat::trim, 26 | "Sets the whitespace characters to be trimmed", 27 | py::arg("ws")) 28 | 29 | .def("quote", 30 | py::overload_cast(&CSVFormat::quote), 31 | "Sets the quote character", 32 | py::arg("quote")) 33 | 34 | .def("quote", 35 | py::overload_cast(&CSVFormat::quote), 36 | "Turn quoting on or off", 37 | py::arg("use_quote")) 38 | 39 | .def("column_names", 40 | &CSVFormat::column_names, 41 | "Sets the column names.", 42 | py::arg("names")) 43 | 44 | .def("header_row", 45 | &CSVFormat::header_row, 46 | "Sets the header row", 47 | py::arg("row")) 48 | .def("no_header", 49 | &CSVFormat::no_header, 50 | "Tells the parser that this CSV has no header row") 51 | .def("is_quoting_enabled", 52 | &CSVFormat::is_quoting_enabled) 53 | .def("get_quote_char", 54 | &CSVFormat::get_quote_char) 55 | .def("get_header", &CSVFormat::get_header) 56 | .def("get_possible_delims", 57 | &CSVFormat::get_possible_delims) 58 | .def("get_trim_chars", 59 | &CSVFormat::get_trim_chars); 60 | } 61 | 62 | void init_CSVReader(py::module& m){ 63 | py::class_(m, "Reader") 64 | .def(py::init(), 65 | "filename"_a, 66 | "format"_a=CSVFormat::guess_csv()) 67 | .def("eof", 68 | &CSVReader::eof, 69 | "Returns true if we have reached end of file") 70 | .def("get_format", 71 | &CSVReader::get_format) 72 | .def("empty", 73 | &CSVReader::empty) 74 | .def("n_rows", 75 | &CSVReader::n_rows, 76 | "Retrieves the number of rows that have been read so far") 77 | .def("utf8_bom", 78 | &CSVReader::utf8_bom, 79 | "Whether or not CSV was prefixed with a UTF-8 bom") 80 | .def("__iter__", 81 | [](CSVReader& reader){return py::make_iterator(reader.begin(), reader.end());}, 82 | py::keep_alive<0, 1>()); 83 | } 84 | 85 | void init_CSVRow(py::module& m){ 86 | py::class_(m, "Row") 87 | .def(py::init<>()) 88 | .def("empty", 89 | &CSVRow::empty, 90 | "Indicates whether row is empty or not") 91 | .def("size", 92 | &CSVRow::size, 93 | "Return the number of fields in this row") 94 | 95 | .def("get_col_names", 96 | 
&CSVRow::get_col_names, 97 | "Retrieve this row's associated column names") 98 | 99 | .def("to_json", &CSVRow::to_json, "subset"_a=std::vector{}) 100 | 101 | .def("to_json_array", &CSVRow::to_json_array, "subset"_a=std::vector{}) 102 | 103 | .def("__getitem__", [](const CSVRow& row, size_t idx){ 104 | if(idx >= row.size()){ 105 | throw py::index_error("index out of range"); 106 | } 107 | return row[idx]; 108 | }, py::is_operator()) 109 | 110 | .def("__getitem__", [](const CSVRow& row, std::string col_name){ 111 | auto column_names = row.get_col_names(); 112 | auto it = std::find(column_names.begin(), column_names.end(), col_name); 113 | if (it != column_names.end()){ 114 | return row[it - column_names.begin()]; 115 | }else{ 116 | throw py::index_error("Can't find a column named " + col_name); 117 | } 118 | }, py::is_operator()); 119 | } 120 | 121 | void init_DataType(py::module& m){ 122 | py::enum_(m, 123 | "DataType", 124 | py::arithmetic(), 125 | "Enumerates the different CSV field types that are recognized by this library") 126 | .value("UNKNOWN" ,DataType::UNKNOWN) 127 | .value("CSV_NULL", DataType::CSV_NULL, "Empty string") 128 | .value("CSV_STRING", DataType::CSV_STRING, "Non-numeric string") 129 | .value("CSV_INT8", DataType::CSV_INT8, "8-bit integer") 130 | .value("CSV_INT16", DataType::CSV_INT16, "16-bit integer") 131 | .value("CSV_INT32", DataType::CSV_INT32, "32-bit integer") 132 | .value("CSV_INT64", DataType::CSV_INT64, "64-bit integer") 133 | .value("CSV_DOUBLE", DataType::CSV_DOUBLE, "Floating point value"); 134 | } 135 | 136 | void init_CSVField(py::module& m){ 137 | py::class_(m, "Field") 138 | .def(py::init()) 139 | .def("is_null", 140 | &CSVField::is_null, 141 | "Returns true if field is an empty string or string of whitespace characters") 142 | .def("get_sv", 143 | &CSVField::get_sv, 144 | "Return a string view over the field's contents") 145 | .def("is_str", 146 | &CSVField::is_str, 147 | "Returns true if field is a non-numeric, non-empty 
string") 148 | .def("is_num", 149 | &CSVField::is_num, 150 | "Returns true if field is an integer or float") 151 | .def("is_int", 152 | &CSVField::is_int, 153 | "Returns true if field is an integer") 154 | .def("is_float", 155 | &CSVField::is_float, 156 | "Returns true if field is a floating point value") 157 | .def("type", 158 | &CSVField::type, 159 | "Return the type of the underlying CSV data") 160 | .def("get_int", &CSVField::get) 161 | .def("get_str", &CSVField::get) 162 | .def("get_double", &CSVField::get) 163 | .def("get_float", &CSVField::get); 164 | } 165 | 166 | void init_CSVUtility(py::module& m){ 167 | py::class_(m, "CSVFileInfo") 168 | .def_readonly("filename",&CSVFileInfo::filename) 169 | .def_readonly("col_names", &CSVFileInfo::col_names) 170 | .def_readonly("delim", &CSVFileInfo::delim) 171 | .def_readonly("n_rows", &CSVFileInfo::n_rows) 172 | .def_readonly("n_cols", &CSVFileInfo::n_cols); 173 | 174 | m.def("parse", 175 | &parse, 176 | "Shorthand function for parsing an in-memory CSV string", 177 | py::arg("in"), py::arg("format")) 178 | .def("parse_no_header", 179 | &parse_no_header, 180 | "Parses a CSV string with no headers", 181 | py::arg("in")) 182 | .def("get_col_pos", 183 | &get_col_pos, 184 | "Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise", 185 | py::arg("filename"), 186 | py::arg("col_name"), 187 | py::arg("format")) 188 | .def("get_file_info", 189 | &get_file_info, 190 | "Get basic information about a CSV file", 191 | py::arg("filename")) 192 | .def("csv_data_types", 193 | &csv_data_types, 194 | "Return a data type for each column such that every value in a column can be converted to the corresponding data type without data loss.", 195 | py::arg("filename")); 196 | } 197 | 198 | void init_CSVStat(py::module& m){ 199 | py::class_(m, "CSVStat") 200 | .def(py::init(), 201 | "filename"_a, 202 | "format"_a=CSVFormat::guess_csv()) 203 | .def("get_mean", 204 | &CSVStat::get_mean, 205 | "Return current means") 206 | 
.def("get_variance", 207 | &CSVStat::get_variance, 208 | "Return current variances") 209 | .def("get_mins", 210 | &CSVStat::get_mins, 211 | "Return current mins") 212 | .def("get_maxes", 213 | &CSVStat::get_maxes, 214 | "Return current maxes") 215 | .def("get_counts", 216 | &CSVStat::get_counts, 217 | "Get counts for each column") 218 | .def("get_dtypes", 219 | &CSVStat::get_dtypes, 220 | "Get data type counts for each column") 221 | .def("get_col_names", 222 | &CSVStat::get_col_names, 223 | "Return the CSV's column names as a List of strings."); 224 | } 225 | 226 | PYBIND11_MODULE(csvpy, m){ 227 | m.doc() = "A modern C++ library for reading, writing, and analyzing CSV (and similar) files."; 228 | init_CSVFormat(m); 229 | init_CSVReader(m); 230 | init_CSVRow(m); 231 | init_DataType(m); 232 | init_CSVField(m); 233 | init_CSVUtility(m); 234 | init_CSVStat(m); 235 | } -------------------------------------------------------------------------------- /include/internal/csv_row_json.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Implements JSON serialization abilities 3 | */ 4 | 5 | #include "csv_row.hpp" 6 | 7 | namespace csv { 8 | /* 9 | The implementations for json_extra_space() and json_escape_string() 10 | were modified from source code for JSON for Modern C++. 11 | 12 | The respective license is below: 13 | 14 | The code is licensed under the [MIT 15 | License](http://opensource.org/licenses/MIT): 16 | 17 | Copyright © 2013-2015 Niels Lohmann. 
18 | 19 | Permission is hereby granted, free of charge, to any person 20 | obtaining a copy of this software and associated documentation files 21 | (the "Software"), to deal in the Software without restriction, 22 | including without limitation the rights to use, copy, modify, merge, 23 | publish, distribute, sublicense, and/or sell copies of the Software, 24 | and to permit persons to whom the Software is furnished to do so, 25 | subject to the following conditions: 26 | 27 | The above copyright notice and this permission notice shall be 28 | included in all copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 31 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 32 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 33 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 34 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 35 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 36 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 | SOFTWARE. 38 | */ 39 | 40 | namespace internals { 41 | /*! 42 | @brief calculates the extra space to escape a JSON string 43 | 44 | @param[in] s the string to escape 45 | @return the number of characters required to escape string @a s 46 | 47 | @complexity Linear in the length of string @a s. 
48 | */ 49 | static std::size_t json_extra_space(csv::string_view& s) noexcept 50 | { 51 | std::size_t result = 0; 52 | 53 | 54 | for (const auto& c : s) 55 | { 56 | switch (c) 57 | { 58 | case '"': 59 | case '\\': 60 | case '\b': 61 | case '\f': 62 | case '\n': 63 | case '\r': 64 | case '\t': 65 | { 66 | // from c (1 byte) to \x (2 bytes) 67 | result += 1; 68 | break; 69 | } 70 | 71 | 72 | default: 73 | { 74 | if (c >= 0x00 && c <= 0x1f) 75 | { 76 | // from c (1 byte) to \uxxxx (6 bytes) 77 | result += 5; 78 | } 79 | break; 80 | } 81 | } 82 | } 83 | 84 | 85 | return result; 86 | } 87 | 88 | CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept 89 | { 90 | const auto space = json_extra_space(s); 91 | if (space == 0) 92 | { 93 | return std::string(s); 94 | } 95 | 96 | // create a result string of necessary size 97 | size_t result_size = s.size() + space; 98 | std::string result(result_size, '\\'); 99 | std::size_t pos = 0; 100 | 101 | for (const auto& c : s) 102 | { 103 | switch (c) 104 | { 105 | // quotation mark (0x22) 106 | case '"': 107 | { 108 | result[pos + 1] = '"'; 109 | pos += 2; 110 | break; 111 | } 112 | 113 | 114 | // reverse solidus (0x5c) 115 | case '\\': 116 | { 117 | // nothing to change 118 | pos += 2; 119 | break; 120 | } 121 | 122 | 123 | // backspace (0x08) 124 | case '\b': 125 | { 126 | result[pos + 1] = 'b'; 127 | pos += 2; 128 | break; 129 | } 130 | 131 | 132 | // formfeed (0x0c) 133 | case '\f': 134 | { 135 | result[pos + 1] = 'f'; 136 | pos += 2; 137 | break; 138 | } 139 | 140 | 141 | // newline (0x0a) 142 | case '\n': 143 | { 144 | result[pos + 1] = 'n'; 145 | pos += 2; 146 | break; 147 | } 148 | 149 | 150 | // carriage return (0x0d) 151 | case '\r': 152 | { 153 | result[pos + 1] = 'r'; 154 | pos += 2; 155 | break; 156 | } 157 | 158 | 159 | // horizontal tab (0x09) 160 | case '\t': 161 | { 162 | result[pos + 1] = 't'; 163 | pos += 2; 164 | break; 165 | } 166 | 167 | 168 | default: 169 | { 170 | if (c >= 0x00 && c <= 0x1f) 
171 | { 172 | // print character c as \uxxxx 173 | snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); 174 | pos += 6; 175 | // overwrite trailing null character 176 | result[pos] = '\\'; 177 | } 178 | else 179 | { 180 | // all other characters are added as-is 181 | result[pos++] = c; 182 | } 183 | break; 184 | } 185 | } 186 | } 187 | 188 | return result; 189 | } 190 | } 191 | 192 | /** Convert a CSV row to a JSON object, i.e. 193 | * `{"col1":"value1","col2":"value2"}` 194 | * 195 | * @note All strings are properly escaped. Numeric values are not quoted. 196 | * @param[in] subset A subset of columns to contain in the JSON. 197 | * Leave empty for original columns. 198 | */ 199 | CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { 200 | std::vector col_names = subset; 201 | if (subset.empty()) { 202 | col_names = this->data ? this->get_col_names() : std::vector({}); 203 | } 204 | 205 | const size_t _n_cols = col_names.size(); 206 | std::string ret = "{"; 207 | 208 | for (size_t i = 0; i < _n_cols; i++) { 209 | auto& col = col_names[i]; 210 | auto field = this->operator[](col); 211 | 212 | // TODO: Possible performance enhancements by caching escaped column names 213 | ret += '"' + internals::json_escape_string(col) + "\":"; 214 | 215 | // Add quotes around strings but not numbers 216 | if (field.is_num()) 217 | ret += internals::json_escape_string(field.get()); 218 | else 219 | ret += '"' + internals::json_escape_string(field.get()) + '"'; 220 | 221 | // Do not add comma after last string 222 | if (i + 1 < _n_cols) 223 | ret += ','; 224 | } 225 | 226 | ret += '}'; 227 | return ret; 228 | } 229 | 230 | /** Convert a CSV row to a JSON array, i.e. 231 | * `["value1","value2",...]` 232 | * 233 | * @note All strings are properly escaped. Numeric values are not quoted. 234 | * @param[in] subset A subset of columns to contain in the JSON. 235 | * Leave empty for all columns. 
236 | */ 237 | CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { 238 | std::vector col_names = subset; 239 | if (subset.empty()) 240 | col_names = this->data ? this->get_col_names() : std::vector({}); 241 | 242 | const size_t _n_cols = col_names.size(); 243 | std::string ret = "["; 244 | 245 | for (size_t i = 0; i < _n_cols; i++) { 246 | auto field = this->operator[](col_names[i]); 247 | 248 | // Add quotes around strings but not numbers 249 | if (field.is_num()) 250 | ret += internals::json_escape_string(field.get()); 251 | else 252 | ret += '"' + internals::json_escape_string(field.get()) + '"'; 253 | 254 | // Do not add comma after last string 255 | if (i + 1 < _n_cols) 256 | ret += ','; 257 | } 258 | 259 | ret += ']'; 260 | return ret; 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /include/internal/csv_reader.hpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * @brief Defines functionality needed for basic CSV parsing 3 | */ 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "../external/mio.hpp" 19 | #include "basic_csv_parser.hpp" 20 | #include "common.hpp" 21 | #include "data_type.hpp" 22 | #include "csv_format.hpp" 23 | 24 | /** The all encompassing namespace */ 25 | namespace csv { 26 | /** Stuff that is generally not of interest to end-users */ 27 | namespace internals { 28 | std::string format_row(const std::vector& row, csv::string_view delim = ", "); 29 | 30 | std::vector _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv()); 31 | 32 | struct GuessScore { 33 | double score; 34 | size_t header; 35 | }; 36 | 37 | CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format); 38 | 39 | CSVGuessResult _guess_format(csv::string_view 
head, const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); 40 | } 41 | 42 | std::vector get_col_names( 43 | csv::string_view filename, 44 | const CSVFormat format = CSVFormat::guess_csv()); 45 | 46 | /** Guess the delimiter used by a delimiter-separated values file */ 47 | CSVGuessResult guess_format(csv::string_view filename, 48 | const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); 49 | 50 | /** @class CSVReader 51 | * @brief Main class for parsing CSVs from files and in-memory sources 52 | * 53 | * All rows are compared to the column names for length consistency 54 | * - By default, rows that are too short or too long are dropped 55 | * - Custom behavior can be defined by overriding bad_row_handler in a subclass 56 | */ 57 | class CSVReader { 58 | public: 59 | /** 60 | * An input iterator capable of handling large files. 61 | * @note Created by CSVReader::begin() and CSVReader::end(). 62 | * 63 | * @par Iterating over a file 64 | * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 1 65 | * 66 | * @par Using with `` library 67 | * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 2 68 | */ 69 | class iterator { 70 | public: 71 | #ifndef DOXYGEN_SHOULD_SKIP_THIS 72 | using value_type = CSVRow; 73 | using difference_type = std::ptrdiff_t; 74 | using pointer = CSVRow * ; 75 | using reference = CSVRow & ; 76 | using iterator_category = std::input_iterator_tag; 77 | #endif 78 | 79 | iterator() = default; 80 | iterator(CSVReader* reader) : daddy(reader) {}; 81 | iterator(CSVReader*, CSVRow&&); 82 | 83 | /** Access the CSVRow held by the iterator */ 84 | CONSTEXPR_14 reference operator*() { return this->row; } 85 | CONSTEXPR_14 reference operator*() const { return const_cast(this->row); } 86 | 87 | /** Return a pointer to the CSVRow the iterator has stopped at */ 88 | CONSTEXPR_14 pointer operator->() { return &(this->row); } 89 | CONSTEXPR_14 pointer operator->() const { return const_cast(&(this->row)); } 90 | 91 | iterator& 
operator++(); /**< Pre-increment iterator */ 92 | iterator operator++(int); /**< Post-increment iterator */ 93 | 94 | /** Returns true if iterators were constructed from the same CSVReader 95 | * and point to the same row 96 | */ 97 | CONSTEXPR bool operator==(const iterator& other) const noexcept { 98 | return (this->daddy == other.daddy) && (this->i == other.i); 99 | } 100 | 101 | CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); } 102 | private: 103 | CSVReader * daddy = nullptr; // Pointer to parent 104 | CSVRow row; // Current row 105 | size_t i = 0; // Index of current row 106 | }; 107 | 108 | /** @name Constructors 109 | * Constructors for iterating over large files and parsing in-memory sources. 110 | */ 111 | ///@{ 112 | CSVReader(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv()); 113 | 114 | /** Allows parsing stream sources such as `std::stringstream` or `std::ifstream` 115 | * 116 | * @tparam TStream An input stream deriving from `std::istream` 117 | * @note Currently this constructor requires special CSV dialects to be manually 118 | * specified. 
119 | */ 120 | template::value, int> = 0> 122 | CSVReader(TStream &source, CSVFormat format = CSVFormat::guess_csv()) : _format(format) { 123 | auto head = internals::get_csv_head(source); 124 | using Parser = internals::StreamParser; 125 | 126 | if (format.guess_delim()) { 127 | auto guess_result = internals::_guess_format(head, format.possible_delimiters); 128 | format.delimiter(guess_result.delim); 129 | format.header = guess_result.header_row; 130 | this->_format = format; 131 | } 132 | 133 | if (!format.col_names.empty()) 134 | this->set_col_names(format.col_names); 135 | 136 | this->parser = std::unique_ptr( 137 | new Parser(source, format, col_names)); // For C++11 138 | this->initial_read(); 139 | } 140 | ///@} 141 | 142 | CSVReader(const CSVReader&) = delete; // No copy constructor 143 | CSVReader(CSVReader&&) = default; // Move constructor 144 | CSVReader& operator=(const CSVReader&) = delete; // No copy assignment 145 | CSVReader& operator=(CSVReader&& other) = default; 146 | ~CSVReader() { 147 | if (this->read_csv_worker.joinable()) { 148 | this->read_csv_worker.join(); 149 | } 150 | } 151 | 152 | /** @name Retrieving CSV Rows */ 153 | ///@{ 154 | bool read_row(CSVRow &row); 155 | iterator begin(); 156 | HEDLEY_CONST iterator end() const noexcept; 157 | 158 | /** Returns true if we have reached end of file */ 159 | bool eof() const noexcept { return this->parser->eof(); }; 160 | ///@} 161 | 162 | /** @name CSV Metadata */ 163 | ///@{ 164 | CSVFormat get_format() const; 165 | std::vector get_col_names() const; 166 | int index_of(csv::string_view col_name) const; 167 | ///@} 168 | 169 | /** @name CSV Metadata: Attributes */ 170 | ///@{ 171 | /** Whether or not the file or stream contains valid CSV rows, 172 | * not including the header. 173 | * 174 | * @note Gives an accurate answer regardless of when it is called. 
175 | * 176 | */ 177 | CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; } 178 | 179 | /** Retrieves the number of rows that have been read so far */ 180 | CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; } 181 | 182 | /** Whether or not CSV was prefixed with a UTF-8 bom */ 183 | bool utf8_bom() const noexcept { return this->parser->utf8_bom(); } 184 | ///@} 185 | 186 | protected: 187 | /** 188 | * \defgroup csv_internal CSV Parser Internals 189 | * @brief Internals of CSVReader. Only maintainers and those looking to 190 | * extend the parser should read this. 191 | * @{ 192 | */ 193 | 194 | /** Sets this reader's column names and associated data */ 195 | void set_col_names(const std::vector&); 196 | 197 | /** @name CSV Settings **/ 198 | ///@{ 199 | CSVFormat _format; 200 | ///@} 201 | 202 | /** @name Parser State */ 203 | ///@{ 204 | /** Pointer to an object containing column information */ 205 | internals::ColNamesPtr col_names = std::make_shared(); 206 | 207 | /** Helper class which actually does the parsing */ 208 | std::unique_ptr parser = nullptr; 209 | 210 | /** Queue of parsed CSV rows */ 211 | std::unique_ptr records{new RowCollection(100)}; 212 | 213 | size_t n_cols = 0; /**< The number of columns in this CSV */ 214 | size_t _n_rows = 0; /**< How many rows (minus header) have been read so far */ 215 | 216 | /** @name Multi-Threaded File Reading Functions */ 217 | ///@{ 218 | bool read_csv(size_t bytes = internals::ITERATION_CHUNK_SIZE); 219 | ///@} 220 | 221 | /**@}*/ 222 | 223 | private: 224 | /** Whether or not rows before header were trimmed */ 225 | bool header_trimmed = false; 226 | 227 | /** @name Multi-Threaded File Reading: Flags and State */ 228 | ///@{ 229 | std::thread read_csv_worker; /**< Worker thread for read_csv() */ 230 | ///@} 231 | 232 | /** Read initial chunk to get metadata */ 233 | void initial_read() { 234 | this->read_csv_worker = std::thread(&CSVReader::read_csv, this, 
internals::ITERATION_CHUNK_SIZE); 235 | this->read_csv_worker.join(); 236 | } 237 | 238 | void trim_header(); 239 | }; 240 | } 241 | -------------------------------------------------------------------------------- /include/internal/csv_row.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Defines the data type used for storing information about a CSV row 3 | */ 4 | 5 | #include 6 | #include 7 | #include "csv_row.hpp" 8 | 9 | namespace csv { 10 | namespace internals { 11 | CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { 12 | const size_t page_no = n / _single_buffer_capacity; 13 | const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; 14 | return this->buffers[page_no][buffer_idx]; 15 | } 16 | 17 | CSV_INLINE void CSVFieldList::allocate() { 18 | buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); 19 | 20 | _current_buffer_size = 0; 21 | _back = buffers.back().get(); 22 | } 23 | } 24 | 25 | /** Return a CSVField object corrsponding to the nth value in the row. 26 | * 27 | * @note This method performs bounds checking, and will throw an 28 | * `std::runtime_error` if n is invalid. 29 | * 30 | * @complexity 31 | * Constant, by calling csv::CSVRow::get_csv::string_view() 32 | * 33 | */ 34 | CSV_INLINE CSVField CSVRow::operator[](size_t n) const { 35 | return CSVField(this->get_field(n)); 36 | } 37 | 38 | /** Retrieve a value by its associated column name. If the column 39 | * specified can't be round, a runtime error is thrown. 40 | * 41 | * @complexity 42 | * Constant. This calls the other CSVRow::operator[]() after 43 | * converting column names into indices using a hash table. 
44 | * 45 | * @param[in] col_name The column to look for 46 | */ 47 | CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { 48 | auto & col_names = this->data->col_names; 49 | auto col_pos = col_names->index_of(col_name); 50 | if (col_pos > -1) { 51 | return this->operator[](col_pos); 52 | } 53 | 54 | throw std::runtime_error("Can't find a column named " + col_name); 55 | } 56 | 57 | CSV_INLINE CSVRow::operator std::vector() const { 58 | std::vector ret; 59 | for (size_t i = 0; i < size(); i++) 60 | ret.push_back(std::string(this->get_field(i))); 61 | 62 | return ret; 63 | } 64 | 65 | CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const 66 | { 67 | using internals::ParseFlags; 68 | 69 | if (index >= this->size()) 70 | throw std::runtime_error("Index out of bounds."); 71 | 72 | const size_t field_index = this->fields_start + index; 73 | auto& field = this->data->fields[field_index]; 74 | auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); 75 | 76 | if (field.has_double_quote) { 77 | auto& value = this->data->double_quote_fields[field_index]; 78 | if (value.empty()) { 79 | bool prev_ch_quote = false; 80 | for (size_t i = 0; i < field.length; i++) { 81 | if (this->data->parse_flags[field_str[i] + CHAR_OFFSET] == ParseFlags::QUOTE) { 82 | if (prev_ch_quote) { 83 | prev_ch_quote = false; 84 | continue; 85 | } 86 | else { 87 | prev_ch_quote = true; 88 | } 89 | } 90 | 91 | value += field_str[i]; 92 | } 93 | } 94 | 95 | return csv::string_view(value); 96 | } 97 | 98 | return field_str.substr(0, field.length); 99 | } 100 | 101 | CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { 102 | size_t start = 0, end = 0; 103 | 104 | // Trim out whitespace chars 105 | for (; start < this->sv.size() && this->sv[start] == ' '; start++); 106 | for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); 107 | 108 | int value_ = 0; 109 | 110 | size_t digits = (end - start); 111 | size_t 
base16_exponent = digits - 1; 112 | 113 | if (digits == 0) return false; 114 | 115 | for (const auto& ch : this->sv.substr(start, digits)) { 116 | int digit = 0; 117 | 118 | switch (ch) { 119 | case '0': 120 | case '1': 121 | case '2': 122 | case '3': 123 | case '4': 124 | case '5': 125 | case '6': 126 | case '7': 127 | case '8': 128 | case '9': 129 | digit = static_cast(ch - '0'); 130 | break; 131 | case 'a': 132 | case 'A': 133 | digit = 10; 134 | break; 135 | case 'b': 136 | case 'B': 137 | digit = 11; 138 | break; 139 | case 'c': 140 | case 'C': 141 | digit = 12; 142 | break; 143 | case 'd': 144 | case 'D': 145 | digit = 13; 146 | break; 147 | case 'e': 148 | case 'E': 149 | digit = 14; 150 | break; 151 | case 'f': 152 | case 'F': 153 | digit = 15; 154 | break; 155 | default: 156 | return false; 157 | } 158 | 159 | value_ += digit * (int)pow(16, (double)base16_exponent); 160 | base16_exponent--; 161 | } 162 | 163 | parsedValue = value_; 164 | return true; 165 | } 166 | 167 | CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { 168 | // If field has already been parsed to empty, no need to do it aagin: 169 | if (this->_type == DataType::CSV_NULL) 170 | return false; 171 | 172 | // Not yet parsed or possibly parsed with other decimalSymbol 173 | if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) 174 | this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again 175 | 176 | // Integral types are not affected by decimalSymbol and need not be parsed again 177 | 178 | // Either we already had an integral type before, or we we just got any numeric type now. 
179 | if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { 180 | dVal = this->value; 181 | return true; 182 | } 183 | 184 | // CSV_NULL or CSV_STRING, not numeric 185 | return false; 186 | } 187 | 188 | #ifdef _MSC_VER 189 | #pragma region CSVRow Iterator 190 | #endif 191 | /** Return an iterator pointing to the first field. */ 192 | CSV_INLINE CSVRow::iterator CSVRow::begin() const { 193 | return CSVRow::iterator(this, 0); 194 | } 195 | 196 | /** Return an iterator pointing to just after the end of the CSVRow. 197 | * 198 | * @warning Attempting to dereference the end iterator results 199 | * in dereferencing a null pointer. 200 | */ 201 | CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { 202 | return CSVRow::iterator(this, (int)this->size()); 203 | } 204 | 205 | CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { 206 | return std::reverse_iterator(this->end()); 207 | } 208 | 209 | CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { 210 | return std::reverse_iterator(this->begin()); 211 | } 212 | 213 | CSV_INLINE HEDLEY_NON_NULL(2) 214 | CSVRow::iterator::iterator(const CSVRow* _reader, int _i) 215 | : daddy(_reader), i(_i) { 216 | if (_i < (int)this->daddy->size()) 217 | this->field = std::make_shared( 218 | this->daddy->operator[](_i)); 219 | else 220 | this->field = nullptr; 221 | } 222 | 223 | CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { 224 | return *(this->field.get()); 225 | } 226 | 227 | CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { 228 | return this->field; 229 | } 230 | 231 | CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { 232 | // Pre-increment operator 233 | this->i++; 234 | if (this->i < (int)this->daddy->size()) 235 | this->field = std::make_shared( 236 | this->daddy->operator[](i)); 237 | else // Reached the end of row 238 | this->field = nullptr; 239 | return *this; 240 | } 241 | 242 | CSV_INLINE CSVRow::iterator 
CSVRow::iterator::operator++(int) { 243 | // Post-increment operator 244 | auto temp = *this; 245 | this->operator++(); 246 | return temp; 247 | } 248 | 249 | CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { 250 | // Pre-decrement operator 251 | this->i--; 252 | this->field = std::make_shared( 253 | this->daddy->operator[](this->i)); 254 | return *this; 255 | } 256 | 257 | CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { 258 | // Post-decrement operator 259 | auto temp = *this; 260 | this->operator--(); 261 | return temp; 262 | } 263 | 264 | CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { 265 | // Allows for iterator arithmetic 266 | return CSVRow::iterator(this->daddy, i + (int)n); 267 | } 268 | 269 | CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { 270 | // Allows for iterator arithmetic 271 | return CSVRow::iterator::operator+(-n); 272 | } 273 | #ifdef _MSC_VER 274 | #pragma endregion CSVRow Iterator 275 | #endif 276 | } 277 | -------------------------------------------------------------------------------- /include/internal/csv_stat.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * Calculates statistics from CSV files 3 | */ 4 | 5 | #include 6 | #include "csv_stat.hpp" 7 | 8 | namespace csv { 9 | /** Calculate statistics for an arbitrarily large file. When this constructor 10 | * is called, CSVStat will process the entire file iteratively. Once finished, 11 | * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. 
12 | */ 13 | CSV_INLINE CSVStat::CSVStat(csv::string_view filename, CSVFormat format) : 14 | reader(filename, format) { 15 | this->calc(); 16 | } 17 | 18 | /** Calculate statistics for a CSV stored in a std::stringstream */ 19 | CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) : 20 | reader(stream, format) { 21 | this->calc(); 22 | } 23 | 24 | /** Return current means */ 25 | CSV_INLINE std::vector CSVStat::get_mean() const { 26 | std::vector ret; 27 | for (size_t i = 0; i < this->get_col_names().size(); i++) { 28 | ret.push_back(this->rolling_means[i]); 29 | } 30 | return ret; 31 | } 32 | 33 | /** Return current variances */ 34 | CSV_INLINE std::vector CSVStat::get_variance() const { 35 | std::vector ret; 36 | for (size_t i = 0; i < this->get_col_names().size(); i++) { 37 | ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); 38 | } 39 | return ret; 40 | } 41 | 42 | /** Return current mins */ 43 | CSV_INLINE std::vector CSVStat::get_mins() const { 44 | std::vector ret; 45 | for (size_t i = 0; i < this->get_col_names().size(); i++) { 46 | ret.push_back(this->mins[i]); 47 | } 48 | return ret; 49 | } 50 | 51 | /** Return current maxes */ 52 | CSV_INLINE std::vector CSVStat::get_maxes() const { 53 | std::vector ret; 54 | for (size_t i = 0; i < this->get_col_names().size(); i++) { 55 | ret.push_back(this->maxes[i]); 56 | } 57 | return ret; 58 | } 59 | 60 | /** Get counts for each column */ 61 | CSV_INLINE std::vector CSVStat::get_counts() const { 62 | std::vector ret; 63 | for (size_t i = 0; i < this->get_col_names().size(); i++) { 64 | ret.push_back(this->counts[i]); 65 | } 66 | return ret; 67 | } 68 | 69 | /** Get data type counts for each column */ 70 | CSV_INLINE std::vector CSVStat::get_dtypes() const { 71 | std::vector ret; 72 | for (size_t i = 0; i < this->get_col_names().size(); i++) { 73 | ret.push_back(this->dtypes[i]); 74 | } 75 | return ret; 76 | } 77 | 78 | CSV_INLINE void CSVStat::calc_chunk() { 79 | /** Only create stats 
counters the first time **/ 80 | if (dtypes.empty()) { 81 | /** Go through all records and calculate specified statistics */ 82 | for (size_t i = 0; i < this->get_col_names().size(); i++) { 83 | dtypes.push_back({}); 84 | counts.push_back({}); 85 | rolling_means.push_back(0); 86 | rolling_vars.push_back(0); 87 | mins.push_back(NAN); 88 | maxes.push_back(NAN); 89 | n.push_back(0); 90 | } 91 | } 92 | 93 | // Start threads 94 | std::vector pool; 95 | for (size_t i = 0; i < this->get_col_names().size(); i++) 96 | pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); 97 | 98 | // Block until done 99 | for (auto& th : pool) 100 | th.join(); 101 | 102 | this->records.clear(); 103 | } 104 | 105 | CSV_INLINE void CSVStat::calc() { 106 | constexpr size_t CALC_CHUNK_SIZE = 5000; 107 | 108 | for (auto& row : reader) { 109 | this->records.push_back(std::move(row)); 110 | 111 | /** Chunk rows */ 112 | if (this->records.size() == CALC_CHUNK_SIZE) { 113 | calc_chunk(); 114 | } 115 | } 116 | 117 | if (!this->records.empty()) { 118 | calc_chunk(); 119 | } 120 | } 121 | 122 | CSV_INLINE void CSVStat::calc_worker(const size_t &i) { 123 | /** Worker thread for CSVStat::calc() which calculates statistics for one column. 
124 | * 125 | * @param[in] i Column index 126 | */ 127 | 128 | auto current_record = this->records.begin(); 129 | 130 | for (size_t processed = 0; current_record != this->records.end(); processed++) { 131 | if (current_record->size() == this->get_col_names().size()) { 132 | auto current_field = (*current_record)[i]; 133 | 134 | // Optimization: Don't count() if there's too many distinct values in the first 1000 rows 135 | if (processed < 1000 || this->counts[i].size() <= 500) 136 | this->count(current_field, i); 137 | 138 | this->dtype(current_field, i); 139 | 140 | // Numeric Stuff 141 | if (current_field.is_num()) { 142 | long double x_n = current_field.get(); 143 | 144 | // This actually calculates mean AND variance 145 | this->variance(x_n, i); 146 | this->min_max(x_n, i); 147 | } 148 | } 149 | else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) { 150 | throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record)); 151 | } 152 | 153 | ++current_record; 154 | } 155 | } 156 | 157 | CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) { 158 | /** Given a record update the type counter 159 | * @param[in] record Data observation 160 | * @param[out] i The column index that should be updated 161 | */ 162 | 163 | auto type = data.type(); 164 | if (this->dtypes[i].find(type) != 165 | this->dtypes[i].end()) { 166 | // Increment count 167 | this->dtypes[i][type]++; 168 | } else { 169 | // Initialize count 170 | this->dtypes[i].insert(std::make_pair(type, 1)); 171 | } 172 | } 173 | 174 | CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) { 175 | /** Given a record update the frequency counter 176 | * @param[in] record Data observation 177 | * @param[out] i The column index that should be updated 178 | */ 179 | 180 | auto item = data.get(); 181 | 182 | if (this->counts[i].find(item) != 183 | this->counts[i].end()) { 184 | // Increment count 185 | 
this->counts[i][item]++; 186 | } else { 187 | // Initialize count 188 | this->counts[i].insert(std::make_pair(item, 1)); 189 | } 190 | } 191 | 192 | CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) { 193 | /** Update current minimum and maximum 194 | * @param[in] x_n Data observation 195 | * @param[out] i The column index that should be updated 196 | */ 197 | if (std::isnan(this->mins[i])) 198 | this->mins[i] = x_n; 199 | if (std::isnan(this->maxes[i])) 200 | this->maxes[i] = x_n; 201 | 202 | if (x_n < this->mins[i]) 203 | this->mins[i] = x_n; 204 | else if (x_n > this->maxes[i]) 205 | this->maxes[i] = x_n; 206 | } 207 | 208 | CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) { 209 | /** Given a record update rolling mean and variance for all columns 210 | * using Welford's Algorithm 211 | * @param[in] x_n Data observation 212 | * @param[out] i The column index that should be updated 213 | */ 214 | long double& current_rolling_mean = this->rolling_means[i]; 215 | long double& current_rolling_var = this->rolling_vars[i]; 216 | long double& current_n = this->n[i]; 217 | long double delta; 218 | long double delta2; 219 | 220 | current_n++; 221 | 222 | if (current_n == 1) { 223 | current_rolling_mean = x_n; 224 | } else { 225 | delta = x_n - current_rolling_mean; 226 | current_rolling_mean += delta/current_n; 227 | delta2 = x_n - current_rolling_mean; 228 | current_rolling_var += delta*delta2; 229 | } 230 | } 231 | 232 | /** Useful for uploading CSV files to SQL databases. 233 | * 234 | * Return a data type for each column such that every value in a column can be 235 | * converted to the corresponding data type without data loss. 
236 | * @param[in] filename The CSV file 237 | * 238 | * \return A mapping of column names to csv::DataType enums 239 | */ 240 | CSV_INLINE std::unordered_map csv_data_types(const std::string& filename) { 241 | CSVStat stat(filename); 242 | std::unordered_map csv_dtypes; 243 | 244 | auto col_names = stat.get_col_names(); 245 | auto temp = stat.get_dtypes(); 246 | 247 | for (size_t i = 0; i < stat.get_col_names().size(); i++) { 248 | auto& col = temp[i]; 249 | auto& col_name = col_names[i]; 250 | 251 | if (col[DataType::CSV_STRING]) 252 | csv_dtypes[col_name] = DataType::CSV_STRING; 253 | else if (col[DataType::CSV_INT64]) 254 | csv_dtypes[col_name] = DataType::CSV_INT64; 255 | else if (col[DataType::CSV_INT32]) 256 | csv_dtypes[col_name] = DataType::CSV_INT32; 257 | else if (col[DataType::CSV_INT16]) 258 | csv_dtypes[col_name] = DataType::CSV_INT16; 259 | else if (col[DataType::CSV_INT8]) 260 | csv_dtypes[col_name] = DataType::CSV_INT8; 261 | else 262 | csv_dtypes[col_name] = DataType::CSV_DOUBLE; 263 | } 264 | 265 | return csv_dtypes; 266 | } 267 | } -------------------------------------------------------------------------------- /include/internal/basic_csv_parser.cpp: -------------------------------------------------------------------------------- 1 | #include "basic_csv_parser.hpp" 2 | 3 | namespace csv { 4 | namespace internals { 5 | CSV_INLINE size_t get_file_size(csv::string_view filename) { 6 | std::ifstream infile(std::string(filename), std::ios::binary); 7 | const auto start = infile.tellg(); 8 | infile.seekg(0, std::ios::end); 9 | const auto end = infile.tellg(); 10 | 11 | return end - start; 12 | } 13 | 14 | CSV_INLINE std::string get_csv_head(csv::string_view filename) { 15 | return get_csv_head(filename, get_file_size(filename)); 16 | } 17 | 18 | CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { 19 | const size_t bytes = 500000; 20 | 21 | std::error_code error; 22 | size_t length = std::min((size_t)file_size, bytes); 23 
| auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); 24 | 25 | if (error) { 26 | throw std::runtime_error("Cannot open file " + std::string(filename)); 27 | } 28 | 29 | return std::string(mmap.begin(), mmap.end()); 30 | } 31 | 32 | #ifdef _MSC_VER 33 | #pragma region IBasicCVParser 34 | #endif 35 | CSV_INLINE IBasicCSVParser::IBasicCSVParser( 36 | const CSVFormat& format, 37 | const ColNamesPtr& col_names 38 | ) : _col_names(col_names) { 39 | if (format.no_quote) { 40 | _parse_flags = internals::make_parse_flags(format.get_delim()); 41 | } 42 | else { 43 | _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); 44 | } 45 | 46 | _ws_flags = internals::make_ws_flags( 47 | format.trim_chars.data(), format.trim_chars.size() 48 | ); 49 | } 50 | 51 | CSV_INLINE void IBasicCSVParser::end_feed() { 52 | using internals::ParseFlags; 53 | 54 | bool empty_last_field = this->data_ptr 55 | && this->data_ptr->_data 56 | && !this->data_ptr->data.empty() 57 | && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER 58 | || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); 59 | 60 | // Push field 61 | if (this->field_length > 0 || empty_last_field) { 62 | this->push_field(); 63 | } 64 | 65 | // Push row 66 | if (this->current_row.size() > 0) 67 | this->push_row(); 68 | } 69 | 70 | CSV_INLINE void IBasicCSVParser::parse_field() noexcept { 71 | using internals::ParseFlags; 72 | auto& in = this->data_ptr->data; 73 | 74 | // Trim off leading whitespace 75 | while (data_pos < in.size() && ws_flag(in[data_pos])) 76 | data_pos++; 77 | 78 | if (field_start == UNINITIALIZED_FIELD) 79 | field_start = (int)(data_pos - current_row_start()); 80 | 81 | // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous 82 | // sequences, use the loop below to avoid having to go through the outer 83 | // switch statement as much as possible 84 | while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == 
ParseFlags::NOT_SPECIAL) 85 | data_pos++; 86 | 87 | field_length = data_pos - (field_start + current_row_start()); 88 | 89 | // Trim off trailing whitespace, this->field_length constraint matters 90 | // when field is entirely whitespace 91 | for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) 92 | this->field_length--; 93 | } 94 | 95 | CSV_INLINE void IBasicCSVParser::push_field() 96 | { 97 | // Update 98 | if (field_has_double_quote) { 99 | fields->emplace_back( 100 | field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, 101 | field_length, 102 | true 103 | ); 104 | field_has_double_quote = false; 105 | 106 | } 107 | else { 108 | fields->emplace_back( 109 | field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, 110 | field_length 111 | ); 112 | } 113 | 114 | current_row.row_length++; 115 | 116 | // Reset field state 117 | field_start = UNINITIALIZED_FIELD; 118 | field_length = 0; 119 | } 120 | 121 | /** @return The number of characters parsed that belong to complete rows */ 122 | CSV_INLINE size_t IBasicCSVParser::parse() 123 | { 124 | using internals::ParseFlags; 125 | 126 | this->quote_escape = false; 127 | this->data_pos = 0; 128 | this->current_row_start() = 0; 129 | this->trim_utf8_bom(); 130 | 131 | auto& in = this->data_ptr->data; 132 | while (this->data_pos < in.size()) { 133 | switch (compound_parse_flag(in[this->data_pos])) { 134 | case ParseFlags::DELIMITER: 135 | this->push_field(); 136 | this->data_pos++; 137 | break; 138 | 139 | case ParseFlags::NEWLINE: 140 | this->data_pos++; 141 | 142 | // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) 143 | while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) 144 | this->data_pos++; 145 | 146 | // End of record -> Write record 147 | this->push_field(); 148 | this->push_row(); 149 | 150 | // Reset 151 | this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); 152 | break; 153 
| 154 | case ParseFlags::NOT_SPECIAL: 155 | this->parse_field(); 156 | break; 157 | 158 | case ParseFlags::QUOTE_ESCAPE_QUOTE: 159 | if (data_pos + 1 == in.size()) return this->current_row_start(); 160 | else if (data_pos + 1 < in.size()) { 161 | auto next_ch = parse_flag(in[data_pos + 1]); 162 | if (next_ch >= ParseFlags::DELIMITER) { 163 | quote_escape = false; 164 | data_pos++; 165 | break; 166 | } 167 | else if (next_ch == ParseFlags::QUOTE) { 168 | // Case: Escaped quote 169 | data_pos += 2; 170 | this->field_length += 2; 171 | this->field_has_double_quote = true; 172 | break; 173 | } 174 | } 175 | 176 | // Case: Unescaped single quote => not strictly valid but we'll keep it 177 | this->field_length++; 178 | data_pos++; 179 | 180 | break; 181 | 182 | default: // Quote (currently not quote escaped) 183 | if (this->field_length == 0) { 184 | quote_escape = true; 185 | data_pos++; 186 | if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) 187 | field_start = (int)(data_pos - current_row_start()); 188 | break; 189 | } 190 | 191 | // Case: Unescaped quote 192 | this->field_length++; 193 | data_pos++; 194 | 195 | break; 196 | } 197 | } 198 | 199 | return this->current_row_start(); 200 | } 201 | 202 | CSV_INLINE void IBasicCSVParser::push_row() { 203 | current_row.row_length = fields->size() - current_row.fields_start; 204 | this->_records->push_back(std::move(current_row)); 205 | } 206 | 207 | CSV_INLINE void IBasicCSVParser::reset_data_ptr() { 208 | this->data_ptr = std::make_shared(); 209 | this->data_ptr->parse_flags = this->_parse_flags; 210 | this->data_ptr->col_names = this->_col_names; 211 | this->fields = &(this->data_ptr->fields); 212 | } 213 | 214 | CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { 215 | auto& data = this->data_ptr->data; 216 | 217 | if (!this->unicode_bom_scan && data.size() >= 3) { 218 | if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { 219 | this->data_pos += 3; // Remove BOM from 
input string 220 | this->_utf8_bom = true; 221 | } 222 | 223 | this->unicode_bom_scan = true; 224 | } 225 | } 226 | #ifdef _MSC_VER 227 | #pragma endregion 228 | #endif 229 | 230 | #ifdef _MSC_VER 231 | #pragma region Specializations 232 | #endif 233 | CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { 234 | // Reset parser state 235 | this->field_start = UNINITIALIZED_FIELD; 236 | this->field_length = 0; 237 | this->reset_data_ptr(); 238 | 239 | // Create memory map 240 | size_t length = std::min(this->source_size - this->mmap_pos, bytes); 241 | std::error_code error; 242 | this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); 243 | this->mmap_pos += length; 244 | if (error) throw error; 245 | 246 | auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); 247 | 248 | // Create string view 249 | this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); 250 | 251 | // Parse 252 | this->current_row = CSVRow(this->data_ptr); 253 | size_t remainder = this->parse(); 254 | 255 | if (this->mmap_pos == this->source_size || no_chunk()) { 256 | this->_eof = true; 257 | this->end_feed(); 258 | } 259 | 260 | this->mmap_pos -= (length - remainder); 261 | } 262 | #ifdef _MSC_VER 263 | #pragma endregion 264 | #endif 265 | } 266 | } 267 | -------------------------------------------------------------------------------- /include/internal/csv_reader.cpp: -------------------------------------------------------------------------------- 1 | /** @file 2 | * @brief Defines functionality needed for basic CSV parsing 3 | */ 4 | 5 | #include "csv_reader.hpp" 6 | 7 | namespace csv { 8 | namespace internals { 9 | CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { 10 | /** Print a CSV row */ 11 | std::stringstream ret; 12 | for (size_t i = 0; i < row.size(); i++) { 13 | ret << row[i]; 14 | if (i + 1 < row.size()) ret << delim; 15 | else ret << 
'\n'; 16 | } 17 | ret.flush(); 18 | 19 | return ret.str(); 20 | } 21 | 22 | /** Return a CSV's column names 23 | * 24 | * @param[in] filename Path to CSV file 25 | * @param[in] format Format of the CSV file 26 | * 27 | */ 28 | CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { 29 | // Parse the CSV 30 | auto trim_chars = format.get_trim_chars(); 31 | std::stringstream source(head.data()); 32 | RowCollection rows; 33 | 34 | StreamParser parser(source, format); 35 | parser.set_output(rows); 36 | parser.next(); 37 | 38 | return CSVRow(std::move(rows[format.get_header()])); 39 | } 40 | 41 | CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) { 42 | // Frequency counter of row length 43 | std::unordered_map row_tally = { { 0, 0 } }; 44 | 45 | // Map row lengths to row num where they first occurred 46 | std::unordered_map row_when = { { 0, 0 } }; 47 | 48 | // Parse the CSV 49 | std::stringstream source(head.data()); 50 | RowCollection rows; 51 | 52 | StreamParser parser(source, format); 53 | parser.set_output(rows); 54 | parser.next(); 55 | 56 | for (size_t i = 0; i < rows.size(); i++) { 57 | auto& row = rows[i]; 58 | 59 | // Ignore zero-length rows 60 | if (row.size() > 0) { 61 | if (row_tally.find(row.size()) != row_tally.end()) { 62 | row_tally[row.size()]++; 63 | } 64 | else { 65 | row_tally[row.size()] = 1; 66 | row_when[row.size()] = i; 67 | } 68 | } 69 | } 70 | 71 | double final_score = 0; 72 | size_t header_row = 0; 73 | 74 | // Final score is equal to the largest 75 | // row size times rows of that size 76 | for (auto& pair : row_tally) { 77 | auto row_size = pair.first; 78 | auto row_count = pair.second; 79 | double score = (double)(row_size * row_count); 80 | if (score > final_score) { 81 | final_score = score; 82 | header_row = row_when[row_size]; 83 | } 84 | } 85 | 86 | return { 87 | final_score, 88 | header_row 89 | }; 90 | } 91 | 92 | /** Guess the delimiter used by a delimiter-separated 
values file */ 93 | CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { 94 | /** For each delimiter, find out which row length was most common. 95 | * The delimiter with the longest mode row length wins. 96 | * Then, the line number of the header row is the first row with 97 | * the mode row length. 98 | */ 99 | 100 | CSVFormat format; 101 | size_t max_score = 0, 102 | header = 0; 103 | char current_delim = delims[0]; 104 | 105 | for (char cand_delim : delims) { 106 | auto result = calculate_score(head, format.delimiter(cand_delim)); 107 | 108 | if ((size_t)result.score > max_score) { 109 | max_score = (size_t)result.score; 110 | current_delim = cand_delim; 111 | header = result.header; 112 | } 113 | } 114 | 115 | return { current_delim, (int)header }; 116 | } 117 | } 118 | 119 | /** Return a CSV's column names 120 | * 121 | * @param[in] filename Path to CSV file 122 | * @param[in] format Format of the CSV file 123 | * 124 | */ 125 | CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { 126 | auto head = internals::get_csv_head(filename); 127 | 128 | /** Guess delimiter and header row */ 129 | if (format.guess_delim()) { 130 | auto guess_result = guess_format(filename, format.get_possible_delims()); 131 | format.delimiter(guess_result.delim).header_row(guess_result.header_row); 132 | } 133 | 134 | return internals::_get_col_names(head, format); 135 | } 136 | 137 | /** Guess the delimiter used by a delimiter-separated values file */ 138 | CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { 139 | auto head = internals::get_csv_head(filename); 140 | return internals::_guess_format(head, delims); 141 | } 142 | 143 | /** Reads an arbitrarily large CSV file using memory-mapped IO. 144 | * 145 | * **Details:** Reads the first block of a CSV file synchronously to get information 146 | * such as column names and delimiting character. 
147 | * 148 | * @param[in] filename Path to CSV file 149 | * @param[in] format Format of the CSV file 150 | * 151 | * \snippet tests/test_read_csv.cpp CSVField Example 152 | * 153 | */ 154 | CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { 155 | auto head = internals::get_csv_head(filename); 156 | using Parser = internals::MmapParser; 157 | 158 | /** Guess delimiter and header row */ 159 | if (format.guess_delim()) { 160 | auto guess_result = internals::_guess_format(head, format.possible_delimiters); 161 | format.delimiter(guess_result.delim); 162 | format.header = guess_result.header_row; 163 | this->_format = format; 164 | } 165 | 166 | if (!format.col_names.empty()) 167 | this->set_col_names(format.col_names); 168 | 169 | this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 170 | this->initial_read(); 171 | } 172 | 173 | /** Return the format of the original raw CSV */ 174 | CSV_INLINE CSVFormat CSVReader::get_format() const { 175 | CSVFormat new_format = this->_format; 176 | 177 | // Since users are normally not allowed to set 178 | // column names and header row simulatenously, 179 | // we will set the backing variables directly here 180 | new_format.col_names = this->col_names->get_col_names(); 181 | new_format.header = this->_format.header; 182 | 183 | return new_format; 184 | } 185 | 186 | /** Return the CSV's column names as a vector of strings. */ 187 | CSV_INLINE std::vector CSVReader::get_col_names() const { 188 | if (this->col_names) { 189 | return this->col_names->get_col_names(); 190 | } 191 | 192 | return std::vector(); 193 | } 194 | 195 | /** Return the index of the column name if found or 196 | * csv::CSV_NOT_FOUND otherwise. 
197 | */ 198 | CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { 199 | auto _col_names = this->get_col_names(); 200 | for (size_t i = 0; i < _col_names.size(); i++) 201 | if (_col_names[i] == col_name) return (int)i; 202 | 203 | return CSV_NOT_FOUND; 204 | } 205 | 206 | CSV_INLINE void CSVReader::trim_header() { 207 | if (!this->header_trimmed) { 208 | for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { 209 | if (i == this->_format.header && this->col_names->empty()) { 210 | this->set_col_names(this->records->pop_front()); 211 | } 212 | else { 213 | this->records->pop_front(); 214 | } 215 | } 216 | 217 | this->header_trimmed = true; 218 | } 219 | } 220 | 221 | /** 222 | * @param[in] names Column names 223 | */ 224 | CSV_INLINE void CSVReader::set_col_names(const std::vector& names) 225 | { 226 | this->col_names->set_col_names(names); 227 | this->n_cols = names.size(); 228 | } 229 | 230 | /** 231 | * Read a chunk of CSV data. 232 | * 233 | * @note This method is meant to be run on its own thread. Only one `read_csv()` thread 234 | * should be active at a time. 235 | * 236 | * @param[in] bytes Number of bytes to read. 237 | * 238 | * @see CSVReader::read_csv_worker 239 | * @see CSVReader::read_row() 240 | */ 241 | CSV_INLINE bool CSVReader::read_csv(size_t bytes) { 242 | // Tell read_row() to listen for CSV rows 243 | this->records->notify_all(); 244 | 245 | this->parser->set_output(*this->records); 246 | this->parser->next(bytes); 247 | 248 | if (!this->header_trimmed) { 249 | this->trim_header(); 250 | } 251 | 252 | // Tell read_row() to stop waiting 253 | this->records->kill_all(); 254 | 255 | return true; 256 | } 257 | 258 | /** 259 | * Retrieve rows as CSVRow objects, returning true if more rows are available. 
260 | * 261 | * @par Performance Notes 262 | * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time 263 | * - For performance details, read the documentation for CSVRow and CSVField. 264 | * 265 | * @param[out] row The variable where the parsed row will be stored 266 | * @see CSVRow, CSVField 267 | * 268 | * **Example:** 269 | * \snippet tests/test_read_csv.cpp CSVField Example 270 | * 271 | */ 272 | CSV_INLINE bool CSVReader::read_row(CSVRow &row) { 273 | while (true) { 274 | if (this->records->empty()) { 275 | if (this->records->is_waitable()) 276 | // Reading thread is currently active => wait for it to populate records 277 | this->records->wait(); 278 | else if (this->parser->eof()) 279 | // End of file and no more records 280 | return false; 281 | else { 282 | // Reading thread is not active => start another one 283 | if (this->read_csv_worker.joinable()) 284 | this->read_csv_worker.join(); 285 | 286 | this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); 287 | } 288 | } 289 | else if (this->records->front().size() != this->n_cols && 290 | this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { 291 | auto errored_row = this->records->pop_front(); 292 | 293 | if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { 294 | if (errored_row.size() < this->n_cols) 295 | throw std::runtime_error("Line too short " + internals::format_row(errored_row)); 296 | 297 | throw std::runtime_error("Line too long " + internals::format_row(errored_row)); 298 | } 299 | } 300 | else { 301 | row = this->records->pop_front(); 302 | this->_n_rows++; 303 | return true; 304 | } 305 | } 306 | 307 | return false; 308 | } 309 | } 310 | --------------------------------------------------------------------------------