1#ifndef TAKANE_CSV_DATA_FRAME_HPP
2#define TAKANE_CSV_DATA_FRAME_HPP
4#include "comservatory/comservatory.hpp"
6#include "WrappedOption.hpp"
10#include <unordered_set>
25namespace csv_data_frame {
44 WrappedOption<std::vector<data_frame::ColumnDetails> >
columns;
60template<
class ParseCommand>
63 if (creator == NULL) {
64 creator = &default_creator;
67 comservatory::Contents contents;
70 auto ptr = creator->
string();
71 output.
fields.emplace_back(ptr);
72 contents.fields.emplace_back(
new CsvNameField(
true, ptr));
75 const auto& columns = *(params.
columns);
76 size_t ncol = columns.size();
77 std::unordered_set<std::string> present;
79 for (
size_t c = 0; c < ncol; ++c) {
80 const auto& col = columns[c];
81 if (present.find(col.name) != present.end()) {
82 throw std::runtime_error(
"duplicate column name '" + col.name +
"'");
84 present.insert(col.name);
86 if (col.type == data_frame::ColumnType::INTEGER) {
88 output.
fields.emplace_back(ptr);
89 contents.fields.emplace_back(
new CsvIntegerField(c, ptr));
91 }
else if (col.type == data_frame::ColumnType::NUMBER) {
92 output.
fields.emplace_back(
nullptr);
93 contents.fields.emplace_back(creator->
number());
95 }
else if (col.type == data_frame::ColumnType::STRING) {
96 if (col.string_format == data_frame::StringFormat::DATE) {
97 auto ptr = creator->
string();
98 output.
fields.emplace_back(ptr);
99 contents.fields.emplace_back(
new CsvDateField(c, ptr));
101 }
else if (col.string_format == data_frame::StringFormat::DATE_TIME) {
102 auto ptr = creator->
string();
103 output.
fields.emplace_back(ptr);
104 contents.fields.emplace_back(
new CsvDateTimeField(c, ptr));
107 output.
fields.emplace_back(
nullptr);
108 contents.fields.emplace_back(creator->
string());
111 }
else if (col.type == data_frame::ColumnType::BOOLEAN) {
112 output.
fields.emplace_back(
nullptr);
113 contents.fields.emplace_back(creator->
boolean());
115 }
else if (col.type == data_frame::ColumnType::FACTOR) {
117 auto ptr = creator->
string();
118 output.
fields.emplace_back(ptr);
119 contents.fields.emplace_back(
new CsvFactorV1Field(c, col.factor_levels.get(), ptr));
122 output.
fields.emplace_back(ptr);
123 contents.fields.emplace_back(
new CsvFactorV2Field(c, col.factor_levels->size(), ptr));
126 }
else if (col.type == data_frame::ColumnType::OTHER) {
127 output.
fields.emplace_back(
nullptr);
128 contents.fields.emplace_back(
new comservatory::UnknownField);
131 throw std::runtime_error(
"unknown code for the expected column type");
135 comservatory::ReadOptions opt;
137 parse(contents, opt);
138 if (contents.num_records() != params.
num_rows) {
139 throw std::runtime_error(
"number of records in the CSV file does not match the expected number of rows");
142 for (
size_t c = 0; c < ncol; ++c) {
143 const auto& col = columns[c];
145 throw std::runtime_error(
"observed and expected header names do not match");
149 output.reconstitute(contents.fields);
172template<
class Reader>
174 return validate_base(
175 [&](comservatory::Contents& contents,
const comservatory::ReadOptions& opt) ->
void { comservatory::read(reader, contents, opt); },
192 return validate_base(
193 [&](comservatory::Contents& contents,
const comservatory::ReadOptions& opt) ->
void { comservatory::read_file(path, contents, opt); },
Validation for data frames.
CsvContents validate(Reader &reader, const Parameters &params, CsvFieldCreator *creator=NULL)
Definition csv_data_frame.hpp:173
takane validation functions.
Definition _derived_from.hpp:15
Contents of the parsed CSV.
Definition utils_csv.hpp:84
std::vector< std::unique_ptr< comservatory::Field > > fields
Definition utils_csv.hpp:89
Create comservatory::Field objects to capture column contents.
Definition utils_csv.hpp:27
virtual comservatory::NumberField * integer()
Definition utils_csv.hpp:40
virtual comservatory::NumberField * number()=0
virtual comservatory::BooleanField * boolean()=0
virtual comservatory::StringField * string()=0
Dummy column creator.
Definition utils_csv.hpp:67
Parameters for validating the CSV data frame.
Definition csv_data_frame.hpp:30
WrappedOption< std::vector< data_frame::ColumnDetails > > columns
Definition csv_data_frame.hpp:44
bool has_row_names
Definition csv_data_frame.hpp:39
size_t num_rows
Definition csv_data_frame.hpp:34
int df_version
Definition csv_data_frame.hpp:54
bool parallel
Definition csv_data_frame.hpp:49
Utilities for parsing CSVs.