takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
csv_data_frame.hpp
Go to the documentation of this file.
1#ifndef TAKANE_CSV_DATA_FRAME_HPP
2#define TAKANE_CSV_DATA_FRAME_HPP
3
4#include "comservatory/comservatory.hpp"
5
6#include "WrappedOption.hpp"
7#include "data_frame.hpp"
8#include "utils_csv.hpp"
9
10#include <unordered_set>
11#include <string>
12#include <stdexcept>
13
19namespace takane {
20
25namespace csv_data_frame {
26
30struct Parameters {
34 size_t num_rows = 0;
35
39 bool has_row_names = false;
40
44 WrappedOption<std::vector<data_frame::ColumnDetails> > columns;
45
49 bool parallel = false;
50
54 int df_version = 2;
55};
56
60template<class ParseCommand>
61CsvContents validate_base(ParseCommand parse, const Parameters& params, CsvFieldCreator* creator) {
62 DummyCsvFieldCreator default_creator;
63 if (creator == NULL) {
64 creator = &default_creator;
65 }
66
67 comservatory::Contents contents;
68 CsvContents output;
69 if (params.has_row_names) {
70 auto ptr = creator->string();
71 output.fields.emplace_back(ptr);
72 contents.fields.emplace_back(new CsvNameField(true, ptr));
73 }
74
75 const auto& columns = *(params.columns);
76 size_t ncol = columns.size();
77 std::unordered_set<std::string> present;
78
79 for (size_t c = 0; c < ncol; ++c) {
80 const auto& col = columns[c];
81 if (present.find(col.name) != present.end()) {
82 throw std::runtime_error("duplicate column name '" + col.name + "'");
83 }
84 present.insert(col.name);
85
86 if (col.type == data_frame::ColumnType::INTEGER) {
87 auto ptr = creator->integer();
88 output.fields.emplace_back(ptr);
89 contents.fields.emplace_back(new CsvIntegerField(c, ptr));
90
91 } else if (col.type == data_frame::ColumnType::NUMBER) {
92 output.fields.emplace_back(nullptr);
93 contents.fields.emplace_back(creator->number());
94
95 } else if (col.type == data_frame::ColumnType::STRING) {
96 if (col.string_format == data_frame::StringFormat::DATE) {
97 auto ptr = creator->string();
98 output.fields.emplace_back(ptr);
99 contents.fields.emplace_back(new CsvDateField(c, ptr));
100
101 } else if (col.string_format == data_frame::StringFormat::DATE_TIME) {
102 auto ptr = creator->string();
103 output.fields.emplace_back(ptr);
104 contents.fields.emplace_back(new CsvDateTimeField(c, ptr));
105
106 } else {
107 output.fields.emplace_back(nullptr);
108 contents.fields.emplace_back(creator->string());
109 }
110
111 } else if (col.type == data_frame::ColumnType::BOOLEAN) {
112 output.fields.emplace_back(nullptr);
113 contents.fields.emplace_back(creator->boolean());
114
115 } else if (col.type == data_frame::ColumnType::FACTOR) {
116 if (params.df_version == 1) {
117 auto ptr = creator->string();
118 output.fields.emplace_back(ptr);
119 contents.fields.emplace_back(new CsvFactorV1Field(c, col.factor_levels.get(), ptr));
120 } else {
121 auto ptr = creator->integer();
122 output.fields.emplace_back(ptr);
123 contents.fields.emplace_back(new CsvFactorV2Field(c, col.factor_levels->size(), ptr));
124 }
125
126 } else if (col.type == data_frame::ColumnType::OTHER) {
127 output.fields.emplace_back(nullptr);
128 contents.fields.emplace_back(new comservatory::UnknownField); // This can be anything.
129
130 } else {
131 throw std::runtime_error("unknown code for the expected column type");
132 }
133 }
134
135 comservatory::ReadOptions opt;
136 opt.parallel = params.parallel;
137 parse(contents, opt);
138 if (contents.num_records() != params.num_rows) {
139 throw std::runtime_error("number of records in the CSV file does not match the expected number of rows");
140 }
141
142 for (size_t c = 0; c < ncol; ++c) {
143 const auto& col = columns[c];
144 if (col.name != contents.names[c + params.has_row_names]) {
145 throw std::runtime_error("observed and expected header names do not match");
146 }
147 }
148
149 output.reconstitute(contents.fields);
150 return output;
151}
172template<class Reader>
173CsvContents validate(Reader& reader, const Parameters& params, CsvFieldCreator* creator = NULL) {
174 return validate_base(
175 [&](comservatory::Contents& contents, const comservatory::ReadOptions& opt) -> void { comservatory::read(reader, contents, opt); },
176 params,
177 creator
178 );
179}
180
191inline CsvContents validate(const char* path, const Parameters& params, CsvFieldCreator* creator = NULL) {
192 return validate_base(
193 [&](comservatory::Contents& contents, const comservatory::ReadOptions& opt) -> void { comservatory::read_file(path, contents, opt); },
194 params,
195 creator
196 );
197}
198
199}
200
201}
202
203#endif
Validation for data frames.
CsvContents validate(Reader &reader, const Parameters &params, CsvFieldCreator *creator=NULL)
Definition csv_data_frame.hpp:173
takane validation functions.
Definition _derived_from.hpp:15
Contents of the parsed CSV.
Definition utils_csv.hpp:84
std::vector< std::unique_ptr< comservatory::Field > > fields
Definition utils_csv.hpp:89
Create comservatory::Field objects to capture column contents.
Definition utils_csv.hpp:27
virtual comservatory::NumberField * integer()
Definition utils_csv.hpp:40
virtual comservatory::NumberField * number()=0
virtual comservatory::BooleanField * boolean()=0
virtual comservatory::StringField * string()=0
Dummy column creator.
Definition utils_csv.hpp:67
Parameters for validating the CSV data frame.
Definition csv_data_frame.hpp:30
WrappedOption< std::vector< data_frame::ColumnDetails > > columns
Definition csv_data_frame.hpp:44
bool has_row_names
Definition csv_data_frame.hpp:39
size_t num_rows
Definition csv_data_frame.hpp:34
int df_version
Definition csv_data_frame.hpp:54
bool parallel
Definition csv_data_frame.hpp:49
Utilities for parsing CSVs.