1#ifndef TAKANE_UTILS_CSV_HPP
2#define TAKANE_UTILS_CSV_HPP
4#include "ritsuko/ritsuko.hpp"
5#include "comservatory/comservatory.hpp"
7#include <unordered_set>
// Members of the column-creator interface visible in this listing. Each one returns a raw
// pointer to a comservatory field of the named kind. string(), number() and boolean() are
// pure virtual; integer() has a body.
// NOTE(review): the enclosing class header and the body of integer() are not visible in this
// listing, so what integer() returns cannot be confirmed from here.
40 virtual comservatory::NumberField*
integer() {
47 virtual comservatory::StringField*
string() = 0;
52 virtual comservatory::NumberField*
number() = 0;
57 virtual comservatory::BooleanField*
boolean() = 0;
// Dummy column creator members: string() and number() each return a freshly allocated
// comservatory dummy field of the matching kind.
// NOTE(review): the pointers come from a bare `new`; presumably the caller takes ownership — confirm.
68 comservatory::StringField*
string() {
69 return new comservatory::DummyStringField;
72 comservatory::NumberField*
number() {
73 return new comservatory::DummyNumberField;
// NOTE(review): presumably the body of boolean(); its signature line is not visible in this listing.
77 return new comservatory::DummyBooleanField;
// Column contents, held as owning pointers to comservatory fields.
89 std::vector<std::unique_ptr<comservatory::Field> >
fields;
// Takes a vector of owning field pointers by reference and visits every element by index.
// NOTE(review): the loop body is not visible in this listing, so what is done with each
// element of `source` cannot be stated from here.
94 void reconstitute(std::vector<std::unique_ptr<comservatory::Field> >& source) {
95 for (
size_t c = 0, total = source.size(); c < total; ++c) {
// String field for a names column that wraps a child comservatory::StringField.
// The constructor stores a flag saying whether the column holds row names, plus the child pointer.
109struct CsvNameField :
public comservatory::StringField {
110 CsvNameField(
bool ar, comservatory::StringField* p) : as_rownames(ar), child(p) {}
// Non-owning as far as this listing shows: the pointer is only stored and called through.
114 comservatory::StringField* child;
// Raises std::runtime_error for a missing value; the wording switches between "row names" and
// "names" according to as_rownames.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
118 throw std::runtime_error(
"missing values should not be present in the " + (as_rownames ? std::string(
"row names") : std::string(
"names")) +
" column");
// Passes the string on to the child field; no validation is visible here.
121 void push_back(std::string x) {
122 child->push_back(std::move(x));
// Reports the child's size.
125 size_t size()
const {
126 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
129 bool filled()
const {
// constexpr helpers returning the upper and lower bounds, as type T, that the integer-like
// fields in this file compare incoming values against.
// NOTE(review): the template headers and the bodies are not visible in this listing. The error
// messages at the call sites describe the range as that of a 32-bit signed integer — confirm
// the actual values against the full source.
135constexpr T upper_integer_limit() {
140constexpr T lower_integer_limit() {
// Number field that validates values as integers before a child comservatory::NumberField sees them.
// The constructor stores the column index (used in error messages) and the child pointer.
144struct CsvIntegerField :
public comservatory::NumberField {
145 CsvIntegerField(
int cid, comservatory::NumberField* p) : column_id(cid), child(p) {}
149 comservatory::NumberField* child;
// Missing entries are forwarded to the child.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
153 child->add_missing();
// Validates x in two steps, each throwing std::runtime_error with column_id + 1 in the message:
//   1. x must lie within [lower_integer_limit<double>(), upper_integer_limit<double>()];
//   2. x must equal std::floor(x), i.e. have no fractional part.
// NOTE(review): what happens after both checks pass is not visible in this listing.
156 void push_back(
double x) {
157 if (x < lower_integer_limit<double>() || x > upper_integer_limit<double>()) {
158 throw std::runtime_error(
"value in column " + std::to_string(column_id + 1) +
" does not fit inside a 32-bit signed integer");
160 if (x != std::floor(x)) {
161 throw std::runtime_error(
"value in column " + std::to_string(column_id + 1) +
" is not an integer");
// Reports the child's size.
166 size_t size()
const {
167 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
170 bool filled()
const {
// Number field that validates values as non-negative integers before a child
// comservatory::NumberField sees them. The constructor stores the column index (used in error
// messages) and the child pointer.
175struct CsvNonNegativeIntegerField :
public comservatory::NumberField {
176 CsvNonNegativeIntegerField(
int cid, comservatory::NumberField* p) : column_id(cid), child(p) {}
180 comservatory::NumberField* child;
// Missing entries are forwarded to the child.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
184 child->add_missing();
// Validation of x; every failure throws std::runtime_error with column_id + 1 in the message.
// Visible checks: x must not exceed upper_integer_limit<double>(), and x must equal std::floor(x).
// NOTE(review): the condition guarding the "should not be negative" throw is not visible in
// this listing, nor is what happens after all checks pass.
187 void push_back(
double x) {
189 throw std::runtime_error(
"value in column " + std::to_string(column_id + 1) +
" should not be negative");
191 if (x > upper_integer_limit<double>()) {
192 throw std::runtime_error(
"value in column " + std::to_string(column_id + 1) +
" does not fit inside a 32-bit signed integer");
194 if (x != std::floor(x)) {
195 throw std::runtime_error(
"value in column " + std::to_string(column_id + 1) +
" is not an integer");
// Reports the child's size.
200 size_t size()
const {
201 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
204 bool filled()
const {
// String field that accepts only values recognised by ritsuko::is_date and forwards them to a
// child comservatory::StringField. The constructor stores the column index (used in error
// messages) and the child pointer.
209struct CsvDateField :
public comservatory::StringField {
210 CsvDateField(
int cid, comservatory::StringField* p) : column_id(cid), child(p) {}
214 comservatory::StringField* child;
// Throws std::runtime_error, quoting the offending string and column_id + 1, when
// ritsuko::is_date rejects x; otherwise moves x into the child.
217 void push_back(std::string x) {
218 if (!ritsuko::is_date(x.c_str(), x.size())) {
219 throw std::runtime_error(
"expected a date in column " + std::to_string(column_id + 1) +
", got '" + x +
"' instead");
221 child->push_back(std::move(x));
// Missing entries are forwarded to the child.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
225 child->add_missing();
// Reports the child's size.
228 size_t size()
const {
229 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
232 bool filled()
const {
// String field that accepts only values recognised by ritsuko::is_rfc3339 and forwards them to
// a child comservatory::StringField. The constructor stores the column index (used in error
// messages) and the child pointer.
237struct CsvDateTimeField :
public comservatory::StringField {
238 CsvDateTimeField(
int cid, comservatory::StringField* p) : column_id(cid), child(p) {}
242 comservatory::StringField* child;
// Throws std::runtime_error, quoting the offending string and column_id + 1, when
// ritsuko::is_rfc3339 rejects x; otherwise moves x into the child.
245 void push_back(std::string x) {
246 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
247 throw std::runtime_error(
"expected an Internet date/time in column " + std::to_string(column_id + 1) +
", got '" + x +
"' instead");
249 child->push_back(std::move(x));
// Missing entries are forwarded to the child.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
253 child->add_missing();
// Reports the child's size.
256 size_t size()
const {
257 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
260 bool filled()
const {
// String field that accepts only values present in a supplied set of levels and forwards them
// to a child comservatory::StringField. The constructor stores the column index (used in error
// messages), a pointer to the level set, and the child pointer.
265struct CsvFactorV1Field :
public comservatory::StringField {
266 CsvFactorV1Field(
int cid,
const std::unordered_set<std::string>* l, comservatory::StringField* p) : column_id(cid), levels(l), child(p) {}
// Held by pointer and only read here; NOTE(review): the set presumably must outlive this field — confirm.
270 const std::unordered_set<std::string>* levels;
271 comservatory::StringField* child;
// Throws std::runtime_error, quoting x and column_id + 1, when x is not found in *levels;
// otherwise moves x into the child.
274 void push_back(std::string x) {
275 if (levels->find(x) == levels->end()) {
276 throw std::runtime_error(
"value '" + x +
"' in column " + std::to_string(column_id + 1) +
" does not refer to a valid level");
278 child->push_back(std::move(x));
// Missing entries are forwarded to the child.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
282 child->add_missing();
// Reports the child's size.
285 size_t size()
const {
286 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
289 bool filled()
const {
// Number field that validates values as factor codes, i.e. integers in [0, nlevels), before a
// child comservatory::NumberField sees them. The constructor stores the column index (used in
// error messages), the number of levels, and the child pointer.
294struct CsvFactorV2Field :
public comservatory::NumberField {
295 CsvFactorV2Field(
int cid,
size_t l, comservatory::NumberField* p) : column_id(cid), nlevels(l), child(p) {
// Construction fails with std::runtime_error if the level count exceeds upper_integer_limit<size_t>().
296 if (nlevels > upper_integer_limit<size_t>()) {
297 throw std::runtime_error(
"number of levels must fit into a 32-bit signed integer");
304 comservatory::NumberField* child;
// Validates x in two steps, each throwing std::runtime_error with column_id + 1 in the message:
//   1. x must satisfy 0 <= x < nlevels;
//   2. x must equal std::floor(x), i.e. have no fractional part.
// NOTE(review): what happens after both checks pass is not visible in this listing.
307 void push_back(
double x) {
308 if (x < 0 || x >= nlevels) {
309 throw std::runtime_error(
"code in column " + std::to_string(column_id + 1) +
" should be non-negative and less than the number of levels");
311 if (x != std::floor(x)) {
312 throw std::runtime_error(
"value in column " + std::to_string(column_id + 1) +
" is not an integer");
// Missing entries are forwarded to the child.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
318 child->add_missing();
// Reports the child's size.
321 size_t size()
const {
322 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
325 bool filled()
const {
// Specialisation of CsvNonNegativeIntegerField for a column of lengths. It reuses the base
// class validation and additionally accumulates the accepted values into `total`.
330struct CsvCompressedLengthField :
public CsvNonNegativeIntegerField {
331 CsvCompressedLengthField(
int cid, comservatory::NumberField* p) : CsvNonNegativeIntegerField(cid, p) {}
// Raises std::runtime_error because a length may not be missing.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
334 throw std::runtime_error(
"lengths should not be missing");
// Runs the base-class push_back first, so `total` is only updated if that call did not throw.
// NOTE(review): the declaration and initial value of `total` are not visible in this listing.
337 void push_back(
double x) {
338 CsvNonNegativeIntegerField::push_back(x);
339 total +=
static_cast<size_t>(x);
// String field that rejects missing values and values already present in `collected`, and
// forwards accepted strings to a child comservatory::StringField. The constructor stores the
// column index (used in error messages) and the child pointer.
//
// Error messages report column_id + 1, matching every other field validator in this file.
// Previously this struct printed column_id unchanged, so the same column was reported with a
// different number depending on which validator raised the error.
345struct CsvUniqueStringField :
public comservatory::StringField {
346 CsvUniqueStringField(
int cid, comservatory::StringField* p) : column_id(cid), child(p) {}
350 comservatory::StringField* child;
// Values against which incoming strings are checked for duplication.
351 std::unordered_set<std::string> collected;
// Raises std::runtime_error because missing values are not allowed in this column.
// NOTE(review): the signature of the enclosing member is not visible in this listing.
355 throw std::runtime_error(
"missing values should not be present in column " + std::to_string(column_id + 1));
// Throws std::runtime_error, quoting x and column_id + 1, when x is already in `collected`;
// otherwise moves x into the child.
// NOTE(review): one line between the check and the forwarding call is not visible in this
// listing; presumably it records x in `collected` — confirm against the full source.
358 void push_back(std::string x) {
359 if (collected.find(x) != collected.end()) {
360 throw std::runtime_error(
"duplicated value '" + x +
"' in column " + std::to_string(column_id + 1));
363 child->push_back(std::move(x));
// Reports the child's size.
366 size_t size()
const {
367 return child->size();
// NOTE(review): the body of filled() is not visible in this listing.
370 bool filled()
const {
takane validation functions.
Definition _derived_from.hpp:15
Contents of the parsed CSV.
Definition utils_csv.hpp:84
std::vector< std::unique_ptr< comservatory::Field > > fields
Definition utils_csv.hpp:89
Create comservatory::Field objects to capture column contents.
Definition utils_csv.hpp:27
virtual comservatory::NumberField * integer()
Definition utils_csv.hpp:40
virtual comservatory::NumberField * number()=0
virtual comservatory::BooleanField * boolean()=0
virtual comservatory::StringField * string()=0
Dummy column creator.
Definition utils_csv.hpp:67
comservatory::StringField * string()
Definition utils_csv.hpp:68
comservatory::BooleanField * boolean()
Definition utils_csv.hpp:76
comservatory::NumberField * number()
Definition utils_csv.hpp:72